np.array
and the low-level constructor
np.ndarray
ndarray
on GitHub
The pandas library is built on top of NumPy. Open a macOS Terminal window and say
pip3 --version pip 19.3.1 from /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pip (python 3.8) pip3 search numpy numpy (1.17.4) - NumPy is the fundamental package for array computing with Python. pip3 install numpy pip3 list pip3 show numpy Name: numpy Version: 1.17.4 Summary: NumPy is the fundamental package for array computing with Python. Home-page: https://www.numpy.org Author: Travis E. Oliphant et al. Author-email: None License: BSD Location: /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages Requires: Required-by: scipy, pandas, matplotlib
See
np.array2string
and
np.set_printoptions
.
"""
Create a one-dimensional NumPy ndarray and demonstrate that it is iterable.
"""

import sys
import numpy as np

data = [10, 20, 30, 40, 50]   #Python list of ints
nd = np.array(data)

#The "=" specifier requires Python 3.8 and displays the repr of the value.
#In Python 3.7, this was print(f"nd = {nd!r}").
print(f"{nd = }")
print(f"{type(nd) = }")
print()

#An np.ndarray supports len and the "in" operator, like a list.
print(f"{len(nd) = }")
print(f"{40 in nd = }")
print(f"{60 in nd = }")

#Loop through the elements.
for n in nd:
    print(n)
print()

#Loop through the elements, keeping track of the position of each one.
for i, n in enumerate(nd):
    print(i, n)

sys.exit(0)
nd = array([10, 20, 30, 40, 50]) type(nd) = <class 'numpy.ndarray'> len(nd) = 5 40 in nd = True 60 in nd = False 10 20 30 40 50 0 10 1 20 2 30 3 40 4 50
See
"""
Index and slice an ndarray.
"""

import sys
import numpy as np

data = [10, 20, 30, 40, 50]
nd = np.array(data)
print(f"{nd = }")
print()

#An index selects a single element.
print(f"{nd[0] = }")      #the element at the front
print(f"{nd[4] = }")      #element number five
print(f"{nd[-1] = }")     #the element at the back
print()

#A slice selects a run of elements.
print(f"{nd[:3] = }")     #the three at the front
print(f"{nd[1:-1] = }")   #everything except the two ends
print(f"{nd[-3:] = }")    #the three at the back
print()

#A third number in the slice is the stride.
print(f"{nd[::2] = }")    #positions 0, 2, 4
print(f"{nd[1::2] = }")   #positions 1, 3
print(f"{nd[::-1] = }")   #every element, in reverse order

sys.exit(0)
nd = array([10, 20, 30, 40, 50]) nd[0] = 10 nd[4] = 50 nd[-1] = 50 nd[:3] = array([10, 20, 30]) nd[1:-1] = array([20, 30, 40]) nd[-3:] = array([30, 40, 50]) nd[::2] = array([10, 30, 50]) nd[1::2] = array([20, 40]) nd[::-1] = array([50, 40, 30, 20, 10])
"""
Display the data type and the memory consumption of an np.ndarray of ints.
"""

import sys
import numpy as np

data = [10, 20, 30, 40, 50]   #five Python ints
nd = np.array(data)
print(f"{nd = }")
print()

#The type of each element.
print(f"{nd.dtype = }")
print(f"{nd.dtype.name = }")
print(f"{nd.dtype.itemsize = }")
print()

#The amount of memory occupied.
print(f"{nd.size = }")             #how many elements
print(f"{nd.itemsize = }")         #bytes per element
print(f"{nd.nbytes = }")           #bytes of data
print(f"{sys.getsizeof(nd) = }")   #bytes of data, plus overhead

sys.exit(0)
nd
contains 5 × 8 = 40 bytes of data,
plus 96 bytes of overhead,
for a total of 136 bytes.
nd = array([10, 20, 30, 40, 50]) nd.dtype = dtype('int64') nd.dtype.name = 'int64' nd.dtype.itemsize = 8 nd.size = 5 nd.itemsize = 8 nd.nbytes = 40 sys.getsizeof(nd) = 136
The same program,
but with a
list
of
float
s:
"""
Display the data type and the memory consumption of an np.ndarray of floats.
"""

import sys
import numpy as np

data = [10.0, 20.0, 30.0, 40.0, 50.0]   #five Python floats
nd = np.array(data)
print(f"{nd = }")
print()

#The type of each element.
print(f"{nd.dtype = }")
print(f"{nd.dtype.name = }")
print(f"{nd.dtype.itemsize = }")
print()

#The amount of memory occupied.
print(f"{nd.size = }")             #how many elements
print(f"{nd.itemsize = }")         #bytes per element
print(f"{nd.nbytes = }")           #bytes of data
print(f"{sys.getsizeof(nd) = }")   #bytes of data, plus overhead

sys.exit(0)
Once again,
nd
contains 5 × 8 = 40 bytes of data,
plus 96 bytes of overhead,
for a total of 136 bytes.
nd = array([10., 20., 30., 40., 50.]) nd.dtype = dtype('float64') nd.dtype.name = 'float64' nd.dtype.itemsize = 8 nd.size = 5 nd.itemsize = 8 nd.nbytes = 40 sys.getsizeof(nd) = 136
"""
Use only half as much memory.
"""

import sys
import numpy as np

data = [10, 20, 30, 40, 50]


def printer(nd):
    "Print the np.ndarray nd, the name and size of its dtype, and its sizes."
    print(f"{nd = }")
    print(f"{nd.dtype.name = }")
    print(f"{nd.dtype.itemsize = }")
    print(f"{nd.size = }")
    print(f"{nd.itemsize = }")
    print(f"{nd.nbytes = }")
    print(f"{sys.getsizeof(nd) = }")


#Let NumPy choose the dtype.
nd64 = np.array(data)
printer(nd64)
print()

#Ask for four-byte ints explicitly.
nd32 = np.array(data, dtype=np.int32)
printer(nd32)

sys.exit(0)
nd = array([10, 20, 30, 40, 50]) nd.dtype.name = 'int64' nd.dtype.itemsize = 8 nd.size = 5 nd.itemsize = 8 nd.nbytes = 40 sys.getsizeof(nd) = 136 nd = array([10, 20, 30, 40, 50], dtype=int32) nd.dtype.name = 'int32' nd.dtype.itemsize = 4 nd.size = 5 nd.itemsize = 4 nd.nbytes = 20 sys.getsizeof(nd) = 116
If the data was
data = 1_000_000 * [10]   #a list of one million ints
how much memory would you save by changing the
dtype
from
np.int64
to
np.int32
?
 | number of possible values | signed | unsigned |
---|---|---|---|
8-bit
(one byte) |
256 = $2^{8}$ |
np.int8
minimum: −128 = $-2^{7}$ maximum: 127 = $2^{7} - 1$ |
np.uint8
minimum: 0 maximum: 255 = $2^{8} - 1$ |
16-bit
(two bytes) |
65,536 = $2^{16}$ |
np.int16
minimum: −32,768 = $-2^{15}$ maximum: 32,767 = $2^{15} - 1$ |
np.uint16
minimum: 0 maximum: 65,535 = $2^{16} - 1$ |
32-bit
(four bytes) |
4,294,967,296 = $2^{32}$ |
np.int32
minimum: −2,147,483,648 = $-2^{31}$ maximum: 2,147,483,647 = $2^{31} - 1$ |
np.uint32
minimum: 0 maximum: 4,294,967,295 = $2^{32} - 1$ |
64-bit
(eight bytes) |
18,446,744,073,709,551,616 = $2^{64}$ |
np.int64
minimum: −9,223,372,036,854,775,808 = $-2^{63}$ maximum: 9,223,372,036,854,775,807 = $2^{63} - 1$ |
np.uint64
minimum: 0 maximum: 18,446,744,073,709,551,615 = $2^{64} - 1$ |
A plain old Python
int
can hold bigger and bigger numbers
until the computer
runs
out of memory.
"""
Print two big Python ints with commas.
"""

import sys

bigInts = (
    9_223_372_036_854_775_807,   #2**63 - 1
    9_223_372_036_854_775_808,
)

for i in bigInts:
    print(f"{i = :,}")

sys.exit(0)
i = 9,223,372,036,854,775,807 i = 9,223,372,036,854,775,808
But a
np.int64
cannot hold values larger than
$2^{63} - 1$.
import sys
import numpy as np

i = np.int64(9_223_372_036_854_775_807)   #2**63 - 1
print(f"{i = :,}")
#Try to construct an np.int64 from the next larger integer.
i = np.int64(9_223_372_036_854_775_808)
print(f"{i = :,}")
print()

sys.exit(0)
i = 9,223,372,036,854,775,807 Traceback (most recent call last): File "/Users/myname/python/prog.py", line 7, in <module> i = np.int64(9_223_372_036_854_775_808) OverflowError: Python int too large to convert to C long
Let $m$ be the number of bits in the mantissa of the floating point number. Then the number of guaranteed significant decimal digits is $\lfloor m \log_{10} 2 \rfloor$.
Alas, on my macOS Catalina 10.15.1,
the vaunted
np.float128
is merely an 80-bit float followed by 48 bits of padding.
 | name | number of
bits in mantissa | number of
significant digits |
---|---|---|---|
16-bit
(two bytes) |
np.float16 |
11 | 3 |
32-bit
(four bytes) |
np.float32 |
24 | 7 |
64-bit
(eight bytes) |
np.float64 |
53 | 15 |
128-bit
(sixteen bytes) |
np.float128 |
64 | 19 |
An
np.float64
is the same as a Python
float
(and the same as a C or C++
double
.)
"""
How close can the various floating point data types get to the fraction 1/3?
"""

import sys
import numpy as np

np.set_printoptions(precision=20)

floatTypes = (
    np.float16,
    np.float32,
    float,        #the plain old Python float
    np.float64,   #same as the plain old Python float
    np.float128,
)

for floatType in floatTypes:
    quotient = floatType(1) / floatType(3)
    #Put the quotient into an np.ndarray containing only one element,
    #convert the np.ndarray to a string, and remove the square brackets.
    text = str(np.array([quotient])).strip("[]")
    threes = text.count("3")
    print(f"{floatType.__name__:8} {text:22} ({threes:2} threes)")

sys.exit(0)
float16 0.3333 ( 4 threes) float32 0.33333334 ( 7 threes) float 0.3333333333333333 (16 threes) float64 0.3333333333333333 (16 threes) float128 0.33333333333333333334 (19 threes)
"""
How many bytes are occupied by 100 million floats?
"""

import sys
import array
import numpy as np

n = 100_000_000

#sys.getsizeof is shallow: for a list or tuple it counts the container
#itself, not the float objects that the container refers to.
print("list:")
li = [float(i) for i in range(n)]   #list comprehension
print(f"{len(li) = :11,}")
print(f"{sys.getsizeof(li) = :11,}")
print(f"{sys.getsizeof(li) / len(li) = }")
print()

print("tuple:")
tu = tuple(li)
print(f"{len(tu) = :11,}")
print(f"{sys.getsizeof(tu) = :11,}")
print()

print("array.array:")
ar = array.array("d", li)   #"d" means double
print(f"{len(ar) = :11,}")
print(f"{ar.itemsize = :11,}")
print(f"{sys.getsizeof(ar) = :11,}")
print()

print("np.ndarray:")
#np.arange ("array range") already returns an np.ndarray, so there is
#no need to pass its return value to np.array and make a second copy.
nd = np.arange(n, dtype=np.float64)
print(f"{nd.size = :11,}")
print(f"{nd.itemsize = :11,}")
print(f"{nd.nbytes = :11,}")
print(f"{sys.getsizeof(nd) = :11,}")

sys.exit(0)
list: len(li) = 100,000,000 sys.getsizeof(li) = 859,724,464 sys.getsizeof(li) / len(li) = 8.59724464 tuple: len(tu) = 100,000,000 sys.getsizeof(tu) = 800,000,040 array.array: len(ar) = 100,000,000 ar.itemsize = 8 sys.getsizeof(ar) = 800,000,064 np.ndarray: nd.size = 100,000,000 nd.itemsize = 8 nd.nbytes = 800,000,000 sys.getsizeof(nd) = 800,000,096
You don’t have to memorize these examples.
Their purpose is to reassure you that you can probably create your
np.ndarray
any way you want to.
See
Array creation
and
Array
creation routines.
"""
Create an np.ndarray from a list.
"""

import sys
import numpy as np

oneTrain = [242, 238, 231, 225, 215, 207]   #a list

a = np.array(oneTrain)
print(f"{a = }")

sys.exit(0)
a = array([242, 238, 231, 225, 215, 207])
"""
Create an np.ndarray of ten zeros, first as ints and then as floats.
"""

import sys
import numpy as np

for dataType in (np.int64, np.float64):
    a = np.zeros(10, dtype=dataType)
    print(f"{a = }")

sys.exit(0)
a = array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) a = array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
"""
Create an np.ndarray of ten ones, first as ints and then as floats.
"""

import sys
import numpy as np

for dataType in (np.int64, np.float64):
    a = np.ones(10, dtype=dataType)
    print(f"{a = }")

sys.exit(0)
a = array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) a = array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
"""
Create an np.ndarray of ten elements, each containing the same value.
"""

import sys
import numpy as np

a = np.full(10, fill_value=100, dtype=np.int64)
print(f"{a = }")

sys.exit(0)
a = array([100, 100, 100, 100, 100, 100, 100, 100, 100, 100])
"""
Create an np.ndarray with np.arange, first of ints and then of floats.
"""

import sys
import numpy as np

#The arguments are start, stop, step, like the arguments of range.
a = np.arange(10, 60, 10, dtype=np.int64)
print(f"{a = }")

a = np.arange(10.0, 60.0, 10.0, dtype=np.float64)
print(f"{a = }")

sys.exit(0)
a = array([10, 20, 30, 40, 50]) a = array([10., 20., 30., 40., 50.])
"""
Create an np.ndarray of four numbers with np.linspace.
"""

import sys
import numpy as np

a = np.linspace(0.0, 100.0, 4)   #start, stop, number of values
print(f"{a = }")

sys.exit(0)
a = array([ 0. , 33.33333333, 66.66666667, 100. ])
You can then overwrite the garbage values; see the “surcharge” comment below.
"""
Create an np.ndarray without initializing its elements.
"""

import sys
import numpy as np

a = np.empty(10, dtype=np.int64)     #unpredictable values
print(f"{a = }")
print()

a = np.empty(10, dtype=np.float64)   #unpredictable values here too
print(f"{a = }")

sys.exit(0)
a = array([-2305843009213693952, -2305843009213693952, 140572591521797, 140572591631200, 140572603403360, 140572602095536, 140572602284256, 140572602277824, 140572602257280, 1547947485859520]) a = array([-2.68156159e+154, -1.49457815e-154, 2.47032823e-323, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 6.95335581e-309])
"""
Start with an np.ndarray of length 0 and append to it.
"""

import sys
import numpy as np

a = np.array([], dtype=np.int64)   #Create an empty ndarray, of length 0.
print(f"{a = }")

#np.append returns a new np.ndarray.  See also np.insert, np.delete, etc.
for addition in (10, [20, 30]):
    a = np.append(a, addition)
    print(f"{a = }")

sys.exit(0)
a = array([], dtype=int64) a = array([10]) a = array([10, 20, 30])
"""
Create an np.ndarray of five random prices.
"""

import sys
import numpy as np

a = np.random.uniform(size=5, low=0.00, high=100.00)
print(f"{a = }")

a = np.around(a, 2)   #Round to the nearest cent.
print(f"{a = }")

sys.exit(0)
a = array([47.92096656, 1.79325858, 12.29409195, 1.03928249, 62.07859764]) a = array([47.92, 1.79, 12.29, 1.04, 62.08])
"""
Create an np.ndarray from the numbers in the text file infile.txt.
"""

import sys
import numpy as np

a = np.genfromtxt("infile.txt", dtype=np.int64)
print(f"{a = }")

sys.exit(0)
a = array([10, 20, 30, 40, 50])
"""
Raise every price in an np.ndarray by six percent, and print the prices as money.
"""

import sys
import numpy as np

prices = np.array([10.00, 20.00, 30.00, 40.00, 50.00], dtype=np.float64)
print(f"{prices = }")

#Three ways to inflict six percent inflation.

#1. Change every value in the np.ndarray.
#for i in range(len(prices)):
#    prices[i] *= 1.06   #could also say prices[i] = prices[i] * 1.06

#2. Create a new list.  Then create a new np.ndarray and discard the old one.
#prices = np.array([price * 1.06 for price in prices])

#3. Use a NumPy vectorized operation to change every value in the np.ndarray.
prices *= 1.06   #could also say prices = prices * 1.06
print(f"{prices = }")
print()

#Print the prices one per line with a loop.
for price in prices:
    print(f"${price:.2f}")   #Print money with 2 digits to the right of the decimal point.
print()

#Print the prices again, this time letting np.array2string do the formatting.
dictionary = {"float": lambda price: f"${price:.2f}"}
s = np.array2string(prices, formatter=dictionary, separator="\n")
print(s.strip("[]"))   #Remove the surrounding square brackets.

sys.exit(0)
prices = array([10., 20., 30., 40., 50.]) prices = array([10.6, 21.2, 31.8, 42.4, 53. ]) $10.60 $21.20 $31.80 $42.40 $53.00 $10.60 $21.20 $31.80 $42.40 $53.00
Try each of the following vectorized operations.
#Modify the existing np.ndarray with an augmented assignment operator.
prices *= 1.06    #could also say prices = prices * 1.06
prices += 1.99    #could also say prices = prices + 1.99
prices **= .5     #could also say prices = prices ** .5   #take the square root

#Compute a new np.ndarray and give it the existing name.
prices = np.sqrt(prices)   #also try np.log, np.exp, etc.
prices = -prices
prices = 1 / prices

#Create a new np.ndarray.
newPrices = prices + 1.99
newPrices = prices.astype(np.float128)
"Create a third np.ndarray from two existing ones."

import sys
import numpy as np

prices     = np.array([10.00, 20.00, 30.00, 40.00, 50.00])   #dtype defaults to np.float64
surcharges = np.array([ 1.00,  1.00,  2.00,  2.00,  3.50])

#Three ways to add the two np.ndarrays together.

#1. Fill a preallocated np.ndarray, one element at a time.
#total = np.empty(len(prices), dtype = np.float64)   #born full of garbage
#for i in range(len(total)):
#    total[i] = prices[i] + surcharges[i]

#2. Start with an empty np.ndarray and append each sum to it.
#total = np.array([], dtype = np.float64)
#for price, surcharge in zip(prices, surcharges):
#    total = np.append(total, price + surcharge)

#3. Use a NumPy vectorized operation.
total = prices + surcharges
print(f"{total = }")

#print(f"{np.around(total) = }")       #Round to closest dollar.
#print(f"{np.around(total, 1) = }")    #Round to closest dime.
#print(f"{np.around(total, -1) = }")   #Round to closest 10 dollars.

sys.exit(0)
total = array([11. , 21. , 32. , 42. , 53.5])
"Create an np.ndarray of bools."

import sys
import numpy as np

prices = np.array([10.00, 20.00, 30.00, 40.00, 50.00])

#The comparison is applied to each price, yielding one bool per price.
expensive = prices > 25.00

print(f"{expensive = }")
print(f"{type(expensive) = }")
print(f"{expensive.dtype.name = }")
print(f"{expensive.dtype.itemsize = }")

sys.exit(0)
expensive = array([False, False, True, True, True]) type(expensive) = <class 'numpy.ndarray'> expensive.dtype.name = 'bool' expensive.dtype.itemsize = 1
"""
Time a vectorized operation on an np.ndarray against a loop through a list.
"""

import sys
import timeit

n = 10_000_000   #how many prices
repeat = 4       #Do the experiment 4 times.
number = 1       #Each experiment consists of 1 execution of the code.

#The np.ndarray: one vectorized statement.
setup = f"""\
import numpy as np
prices = np.random.uniform(size = {n}, low = 0.00, high = 100.00)
prices = np.around(prices, 2)"""   #Round to closest cent.

code = "prices *= 1.06"
arrayTimes = timeit.repeat(code, setup=setup, repeat=repeat, number=number)
print(f"{arrayTimes} seconds")
minArray = min(arrayTimes)
print(f"{minArray} seconds is the minimum.")
print()

#The list: a Python loop that changes one element at a time.
setup = f"""\
import random
prices = [round(random.uniform(0.00, 100.00), 2) for _ in range({n})]"""

code = "for i in range(len(prices)): prices[i] *= 1.06"
listTimes = timeit.repeat(code, setup=setup, repeat=repeat, number=number)
print(f"{listTimes} seconds")
minList = min(listTimes)
print(f"{minList} seconds is the minimum.")
print()

print(f"The np.ndarray is {minList / minArray} times faster than the list.")

sys.exit(0)
[0.012434812999999822, 0.005454579000000015, 0.005510028999999861, 0.005391932999999849] seconds 0.005391932999999849 seconds is the minimum. [0.8033366490000002, 0.7729674679999992, 0.7890182360000004, 0.7625401109999999] seconds 0.7625401109999999 seconds is the minimum. The np.ndarray is 141.42240102019466 times faster than the list.
ndarray
.
"""
Create a two-dimensional np.ndarray and display its shape and size.
"""

import sys
import functools
import operator
import numpy as np

data = [   #a list of lists
    [ 0,  1,  2,  3],
    [10, 11, 12, 13],
    [20, 21, 22, 23]
]

nd = np.array(data)
print(f"{nd = }")
print()

#The outer loop visits each row; the inner loop visits each element of the row.
for row in nd:
    for element in row:
        print(f"{element:2} ", end="")
    print()
print()

print(f"{len(nd) = }")         #number of rows
print(f"{nd.shape = }")        #a tuple
print(f"{len(nd.shape) = }")   #number of dimensions
print(f"{nd.ndim = }")         #simpler way to get the number of dimensions
print()

#Multiply the dimensions together with a loop.
product = 1
for extent in nd.shape:
    product *= extent
print(f"total number of elements = {product}")

#Simpler way to get the total number of elements.
product = functools.reduce(operator.mul, nd.shape, 1)
print(f"total number of elements = {product}")

#Even simpler way to get the total number of elements.
print(f"total number of elements = {np.prod(nd.shape)}")

#Simplest way to get the total number of elements.
print(f"total number of elements = {nd.size}")
print()

print(f"{nd.itemsize = }")
print(f"{nd.nbytes = }")
print(f"{sys.getsizeof(nd) = }")

sys.exit(0)
nd
contains 3 × 4 × 8 = 96 bytes of data,
plus 112 bytes of overhead, for a total of 208 bytes.
nd = array([[ 0, 1, 2, 3], [10, 11, 12, 13], [20, 21, 22, 23]]) 0 1 2 3 10 11 12 13 20 21 22 23 len(nd) = 3 nd.shape = (3, 4) len(nd.shape) = 2 nd.ndim = 2 total number of elements = 12 total number of elements = 12 total number of elements = 12 total number of elements = 12 nd.itemsize = 8 nd.nbytes = 96 sys.getsizeof(nd) = 208
np.ndarray
?
You’ll have to add another
for
loop.
Which way would you prefer to type the data?
#A list of two lists, each of which is a list of three lists of four ints.
#In this layout every square bracket of the two outer levels is on a line of its own.
data = [
    [
        [  0,   1,   2,   3],
        [ 10,  11,  12,  13],
        [ 20,  21,  22,  23]
    ],
    [
        [100, 101, 102, 103],
        [110, 111, 112, 113],
        [120, 121, 122, 123]
    ],
]
#The same list of lists of lists, in a more compact layout:
#the square brackets of the middle level share a line with the rows.
data = [
    [[  0,   1,   2,   3],
     [ 10,  11,  12,  13],
     [ 20,  21,  22,  23]],

    [[100, 101, 102, 103],
     [110, 111, 112, 113],
     [120, 121, 122, 123]],
]