Class numpy.ndarray

Documentation

  1. from the Quickstart tutorial
    1. The Basics
    2. Array Creation
  2. from NumPy Basics
    1. Array Creation, including the function np.array and the low-level constructor np.ndarray
  3. from the NumPy Reference
    1. Array objects
    2. The N-dimensional array (ndarray)
  4. from the NumPy Glossary
    1. array
  5. ndarray on GitHub

Install the numpy module.

The pandas library is built on top of NumPy. Open a macOS Terminal window and say

pip3 --version
pip 19.3.1 from /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pip (python 3.8)

pip3 search numpy
numpy (1.17.4)                         - NumPy is the fundamental package for
                                         array computing with Python.

pip3 install numpy
pip3 list

pip3 show numpy
Name: numpy
Version: 1.17.4
Summary: NumPy is the fundamental package for array computing with Python.
Home-page: https://www.numpy.org
Author: Travis E. Oliphant et al.
Author-email: None
License: BSD
Location: /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages
Requires:
Required-by: scipy, pandas, matplotlib

Create a one-dimensional np.ndarray and demonstrate that it is iterable.

See np.array2string and np.set_printoptions.

"""
Create a one-dimensional NumPy ndarray and demonstrate that it is iterable.
"""

import sys
import numpy as np

data = [10, 20, 30, 40, 50] #Python list of ints
nd = np.array(data)         #Copy the list into a new one-dimensional np.ndarray.
print(f"{nd = }")           #The = specifier needs Python 3.8 and shows the repr; in 3.7, say print(f"nd = {nd!r}").
print(f"{type(nd) = }")
print()

#An np.ndarray supports len and the in operator, just as a list does.
print(f"{len(nd) = }")
print(f"{40 in nd = }")
print(f"{60 in nd = }")

#Iterate over the elements.
for n in nd:
    print(n)
print()

#Iterate over the elements together with their indices.
for i, n in enumerate(nd):
    print(i, n)

sys.exit(0)
nd = array([10, 20, 30, 40, 50])
type(nd) = <class 'numpy.ndarray'>

len(nd) = 5
40 in nd = True
60 in nd = False

10
20
30
40
50

0 10
1 20
2 30
3 40
4 50

An np.ndarray can be indexed and sliced.

See

  1. Indexing, Slicing, and Iterating
  2. Indexing
  3. Indexing
"""
Index and slice an ndarray.
"""

import sys
import numpy as np

data = [10, 20, 30, 40, 50]
nd = np.array(data)
print(f"{nd = }")
print()

#index

print(f"{nd[0] = }")    #first element
print(f"{nd[4] = }")    #fifth element
print(f"{nd[-1] = }")   #last element
print()

#slice

print(f"{nd[:3] = }")   #first three elements
print(f"{nd[1:-1] = }") #all but the first and last elements
print(f"{nd[-3:] = }")  #last three elements
print()

#a slice with an explicit stride

print(f"{nd[::2] = }")  #just the elements at the even positions
print(f"{nd[1::2] = }") #just the elements at the odd positions
print(f"{nd[::-1] = }") #all the elements, from last to first

sys.exit(0)
nd = array([10, 20, 30, 40, 50])

nd[0] = 10
nd[4] = 50
nd[-1] = 50

nd[:3] = array([10, 20, 30])
nd[1:-1] = array([20, 30, 40])
nd[-3:] = array([30, 40, 50])

nd[::2] = array([10, 30, 50])
nd[1::2] = array([20, 40])
nd[::-1] = array([50, 40, 30, 20, 10])

Each np.ndarray has a dtype.

import sys
import numpy as np

data = [10, 20, 30, 40, 50]   #a list of ints
nd = np.array(data)
print(f"{nd = }")
print()

print(f"{nd.dtype = }")
print(f"{nd.dtype.name = }")
print(f"{nd.dtype.itemsize = }")
print()

print(f"{nd.size = }")   #number of elements
print(f"{nd.itemsize = }")
print(f"{nd.nbytes = }")
print(f"{sys.getsizeof(nd) = }")

sys.exit(0)

nd contains 5 × 8 = 40 bytes of data, plus 96 bytes of overhead, for a total of 136 bytes.

nd = array([10, 20, 30, 40, 50])

nd.dtype = dtype('int64')
nd.dtype.name = 'int64'
nd.dtype.itemsize = 8

nd.size = 5
nd.itemsize = 8
nd.nbytes = 40
sys.getsizeof(nd) = 136

The same program, but with a list of floats:

import sys
import numpy as np

data = [10.0, 20.0, 30.0, 40.0, 50.0]   #a list of floats
nd = np.array(data)
print(f"{nd = }")
print()

print(f"{nd.dtype = }")
print(f"{nd.dtype.name = }")
print(f"{nd.dtype.itemsize = }")
print()

print(f"{nd.size = }")   #number of elements
print(f"{nd.itemsize = }")
print(f"{nd.nbytes = }")
print(f"{sys.getsizeof(nd) = }")

sys.exit(0)

Once again, nd contains 5 × 8 = 40 bytes of data, plus 96 bytes of overhead, for a total of 136 bytes.

nd = array([10., 20., 30., 40., 50.])

nd.dtype = dtype('float64')
nd.dtype.name = 'float64'
nd.dtype.itemsize = 8

nd.size = 5
nd.itemsize = 8
nd.nbytes = 40
sys.getsizeof(nd) = 136

Specify the dtype of an np.ndarray.

"""
Use only half as much memory.
"""

import sys
import numpy as np

data = [10, 20, 30, 40, 50]

def printer(nd):
    """
    Print nd itself, then the name and item size of its dtype,
    then its element count, item size, data size, and total size in bytes.
    """
    report = (
        f"{nd = }",
        f"{nd.dtype.name = }",
        f"{nd.dtype.itemsize = }",
        f"{nd.size = }",
        f"{nd.itemsize = }",
        f"{nd.nbytes = }",
        f"{sys.getsizeof(nd) = }",
    )
    print(*report, sep = "\n")   #one report line per output line

nd64 = np.array(data)
printer(nd64)
print()

nd32 = np.array(data, dtype = np.int32)
printer(nd32)
sys.exit(0)
nd = array([10, 20, 30, 40, 50])
nd.dtype.name = 'int64'
nd.dtype.itemsize = 8
nd.size = 5
nd.itemsize = 8
nd.nbytes = 40
sys.getsizeof(nd) = 136

nd = array([10, 20, 30, 40, 50], dtype=int32)
nd.dtype.name = 'int32'
nd.dtype.itemsize = 4
nd.size = 5
nd.itemsize = 4
nd.nbytes = 20
sys.getsizeof(nd) = 116

If the data was

data = 1_000_000 * [10]   #a list of one million ints
how much memory would you save by changing the dtype from np.int64 to np.int32?

The integer dtypes

| size | number of possible values | signed | unsigned |
|---|---|---|---|
| 8-bit (one byte) | 256 = $2^{8}$ | np.int8; minimum: −128 = $-2^{7}$; maximum: 127 = $2^{7} - 1$ | np.uint8; minimum: 0; maximum: 255 = $2^{8} - 1$ |
| 16-bit (two bytes) | 65,536 = $2^{16}$ | np.int16; minimum: −32,768 = $-2^{15}$; maximum: 32,767 = $2^{15} - 1$ | np.uint16; minimum: 0; maximum: 65,535 = $2^{16} - 1$ |
| 32-bit (four bytes) | 4,294,967,296 = $2^{32}$ | np.int32; minimum: −2,147,483,648 = $-2^{31}$; maximum: 2,147,483,647 = $2^{31} - 1$ | np.uint32; minimum: 0; maximum: 4,294,967,295 = $2^{32} - 1$ |
| 64-bit (8 bytes) | 18,446,744,073,709,551,616 = $2^{64}$ | np.int64; minimum: −9,223,372,036,854,775,808 = $-2^{63}$; maximum: 9,223,372,036,854,775,807 = $2^{63} - 1$ | np.uint64; minimum: 0; maximum: 18,446,744,073,709,551,615 = $2^{64} - 1$ |

A plain old Python int can hold bigger and bigger numbers until the computer runs out of memory.

import sys

i = 9_223_372_036_854_775_807   #2**63 - 1
print(f"{i = :,}")

i = 9_223_372_036_854_775_808
print(f"{i = :,}")

sys.exit(0)
i = 9,223,372,036,854,775,807
i = 9,223,372,036,854,775,808

But an np.int64 cannot hold values larger than $2^{63} - 1$.

import sys
import numpy as np

i = np.int64(9_223_372_036_854_775_807)   #2**63 - 1
print(f"{i = :,}")

i = np.int64(9_223_372_036_854_775_808)
print(f"{i = :,}")
print()

sys.exit(0)
i = 9,223,372,036,854,775,807
Traceback (most recent call last):
  File "/Users/myname/python/prog.py", line 7, in <module>
    i = np.int64(9_223_372_036_854_775_808)
OverflowError: Python int too large to convert to C long

The floating point dtypes

Let m be the number of bits in the mantissa of the floating point number. Then the number of guaranteed significant decimal digits is

$\log_{10}(2^{m})$,
rounded down to a whole number. For example,
$\log_{10}(2^{53}) \approx 15.95458977019100334638$, which rounds down to 15.

Alas, on my macOS Catalina 10.15.1, the vaunted np.float128 is merely an 80-bit float followed by 48 bits of padding.

name number of
bits in mantissa
number of
significant digits
16-bit
(two bytes)
np.float16 11 3
32-bit
(four bytes)
np.float32 24 7
64-bit
(eight bytes)
np.float64 53 15
128-bit
(sixteen bytes)
np.float128 64 19

An np.float64 is the same as a Python float (and the same as a C or C++ double.)

"""
How close can the various floating point data types get to the fraction 1/3?
"""

import sys
import numpy as np

#The floating point types to compare, from the least precise to the most precise.
floatTypes = (
    np.float16,
    np.float32,
    float,        #the plain old Python float
    np.float64,   #same as the plain old Python float
    np.float128
)

np.set_printoptions(precision = 20)   #Let an np.ndarray print up to 20 digits after the decimal point.

for floatType in floatTypes:
    quotient = floatType(1) / floatType(3)
    #Format the quotient as a one-element np.ndarray, then strip the surrounding brackets.
    digits = str(np.array([quotient])).strip("[]")
    print(f"{floatType.__name__:8} {digits:22} ({digits.count('3'):2} threes)")

sys.exit(0)
float16  0.3333                 ( 4 threes)
float32  0.33333334             ( 7 threes)
float    0.3333333333333333     (16 threes)
float64  0.3333333333333333     (16 threes)
float128 0.33333333333333333334 (19 threes)

Memory usage

"""
How many bytes are occupied by 100 million floats?
"""

import sys
import array
import numpy as np

n = 100_000_000

#sys.getsizeof is shallow.  For the list and the tuple it counts only the
#references they hold, not the float objects those references point to.
#The array.array and the np.ndarray hold the 8-byte values themselves.

print("list:")
li = [float(i) for i in range(n)]   #list comprehension
print(f"{len(li)           = :11,}")
print(f"{sys.getsizeof(li) = :11,}")
print(f"{sys.getsizeof(li) / len(li) = }")
print()

print("tuple:")
tu = tuple(li)
print(f"{len(tu)           = :11,}")
print(f"{sys.getsizeof(tu) = :11,}")
print()

print("array.array:")
ar = array.array("d", li)   #"d" means C double, 8 bytes per item
print(f"{len(ar)           = :11,}")
print(f"{ar.itemsize       = :11,}")
print(f"{sys.getsizeof(ar) = :11,}")
print()

print("np.ndarray:")
#np.arange already returns a new np.ndarray, so there is no need to pass it
#to np.array, which would make a second 800-megabyte copy.
nd = np.arange(n, dtype = np.float64)   #"array range"
print(f"{nd.size           = :11,}")
print(f"{nd.itemsize       = :11,}")
print(f"{nd.nbytes         = :11,}")
print(f"{sys.getsizeof(nd) = :11,}")

sys.exit(0)
list:
len(li)           = 100,000,000
sys.getsizeof(li) = 859,724,464
sys.getsizeof(li) / len(li) = 8.59724464

tuple:
len(tu)           = 100,000,000
sys.getsizeof(tu) = 800,000,040

array.array:
len(ar)           = 100,000,000
ar.itemsize       =           8
sys.getsizeof(ar) = 800,000,064

np.ndarray:
nd.size           = 100,000,000
nd.itemsize       =           8
nd.nbytes         = 800,000,000
sys.getsizeof(nd) = 800,000,096

Many ways to create an np.ndarray.

You don’t have to memorize these examples. Their purpose is to reassure you that you can probably create your np.ndarray any way you want to. See Array creation and Array creation routines.

from a list

import sys
import numpy as np

oneTrain = [242, 238, 231, 225, 215, 207]   #a list
a = np.array(oneTrain)
print(f"{a = }")
sys.exit(0)
a = array([242, 238, 231, 225, 215, 207])

np.zeros

import sys
import numpy as np

a = np.zeros(10, dtype = np.int64)
print(f"{a = }")

a = np.zeros(10, dtype = np.float64)
print(f"{a = }")

sys.exit(0)
a = array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
a = array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

np.ones

import sys
import numpy as np

a = np.ones(10, dtype = np.int64)
print(f"{a = }")

a = np.ones(10, dtype = np.float64)
print(f"{a = }")

sys.exit(0)
a = array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
a = array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

np.full

import sys
import numpy as np

a = np.full(10, fill_value = 100, dtype = np.int64)
print(f"{a = }")

sys.exit(0)
a = array([100, 100, 100, 100, 100, 100, 100, 100, 100, 100])

np.arange is like the Python built-in range.

import sys
import numpy as np

a = np.arange(10, 60, 10, dtype = np.int64)   #like range
print(f"{a = }")

a = np.arange(10.0, 60.0, 10.0, dtype = np.float64)
print(f"{a = }")

sys.exit(0)
a = array([10, 20, 30, 40, 50])
a = array([10., 20., 30., 40., 50.])

Divide the interval into thirds.

import sys
import numpy as np

a = np.linspace(0.0, 100.0, 4)
print(f"{a = }")
sys.exit(0)
a = array([  0.        ,  33.33333333,  66.66666667, 100.        ])

Create the np.ndarray, but leave it full of garbage.

You can then overwrite the garbage values; see the “surcharge” comment below.

import sys
import numpy as np

a = np.empty(10, dtype = np.int64)   #unpredictable values
print(f"{a = }")
print()

a = np.empty(10, dtype = np.float64)
print(f"{a = }")

sys.exit(0)
a = array([-2305843009213693952, -2305843009213693952,      140572591521797,
            140572591631200,      140572603403360,      140572602095536,
            140572602284256,      140572602277824,      140572602257280,
           1547947485859520])

a = array([-2.68156159e+154, -1.49457815e-154,  2.47032823e-323,
        0.00000000e+000,  0.00000000e+000,  0.00000000e+000,
        0.00000000e+000,  0.00000000e+000,  0.00000000e+000,
        6.95335581e-309])

Start with no elements, then append.

import sys
import numpy as np

a = np.array([], dtype = np.int64)   #Create an empty ndarray, of length 0.
print(f"{a = }")

a = np.append(a, 10)   #See also np.insert, np.delete, etc.
print(f"{a = }")

a = np.append(a, [20, 30])
print(f"{a = }")

sys.exit(0)
a = array([], dtype=int64)
a = array([10])
a = array([10, 20, 30])

Random prices

import sys
import numpy as np

a = np.random.uniform(size = 5, low = 0.00, high = 100.00)
print(f"{a = }")

a = np.around(a, 2)   #Round to the nearest cent.
print(f"{a = }")

sys.exit(0)
a = array([47.92096656,  1.79325858, 12.29409195,  1.03928249, 62.07859764])
a = array([47.92,  1.79, 12.29,  1.04, 62.08])

Read from a text file.

import sys
import numpy as np

a = np.genfromtxt("infile.txt", dtype = np.int64)
print(f"{a = }")
sys.exit(0)
a = array([10, 20, 30, 40, 50])

Vectorized operations

import sys
import numpy as np

prices = np.array([10.00, 20.00, 30.00, 40.00, 50.00], dtype = np.float64)
print(f"{prices = }")

#Three ways to inflict six percent inflation.
#Only the third way is active; uncomment one of the others to try it instead.

#1. Use a for loop to change every value in the np.ndarray, one element at a time.
#for i in range(len(prices)):
#    prices[i] *= 1.06 #could also say prices[i] = prices[i] * 1.06

#2. Create a new list.  Then create a new np.ndarray and discard the old one.
#prices = np.array([price * 1.06 for price in prices])

#3. Use a NumPy vectorized operation to change every value in the np.ndarray.
prices *= 1.06         #changes prices in place; prices = prices * 1.06 would create a new np.ndarray instead

print(f"{prices = }")
print()

#Print the prices as money, one per line, with a for loop.
for price in prices:
    print(f"${price:.2f}") #Print money with 2 digits to the right of the decimal point.
print()

#Print the prices as money again, this time without a for loop.
#The "float" key tells np.array2string to call the lambda on each floating point element.
dictionary = {"float": lambda price: f"${price:.2f}"}
s = np.array2string(prices, formatter = dictionary, separator = "\n") #one price per line
print(s.strip("[]"))   #Remove the enclosing square brackets.

sys.exit(0)
prices = array([10., 20., 30., 40., 50.])
prices = array([10.6, 21.2, 31.8, 42.4, 53. ])

$10.60
$21.20
$31.80
$42.40
$53.00

$10.60
$21.20
$31.80
$42.40
$53.00

Try each of the following vectorized operations.

#Modify the existing np.ndarray in place.
prices *= 1.06   #could also say prices = prices * 1.06
prices += 1.99   #could also say prices = prices + 1.99
prices **= .5    #could also say prices = prices ** .5   #take the square root

#Compute a new np.ndarray from the old one and give it the same name, prices.
prices = np.sqrt(prices)   #also try np.log, np.exp, etc.
prices = -prices
prices = 1 / prices

#Create a new np.ndarray with a different name, leaving prices unchanged.
newPrices = prices + 1.99
newPrices = prices.astype(np.float128)   #same values, converted to another dtype
"Create a third np.ndarray from two existing ones."

import sys
import numpy as np

prices     = np.array([10.00, 20.00, 30.00, 40.00, 50.00]) #dtype defaults to np.float64
surcharges = np.array([ 1.00,  1.00,  2.00,  2.00,  3.50])

#Three ways to add the two np.ndarrays together.

#total = np.empty(len(prices), dtype = np.float64)   #born full of garbage
#for i in range(len(total)):
#    total[i] = prices[i] + surcharges[i]

#total = np.array([], dtype = np.float64)
#for price, surcharge in zip(prices, surcharges):
#    total = np.append(total, price + surcharge)

total = prices + surcharges
print(f"{total = }")

#print(f"{np.around(total) = }")     #Round to closest dollar.
#print(f"{np.around(total,  1) = }") #Round to closest dime.
#print(f"{np.around(total, -1) = }") #Round to closest 10 dollars.
sys.exit(0)
total = array([11. , 21. , 32. , 42. , 53.5])
"Create an np.ndarray of bools."

import sys
import numpy as np

prices = np.array([10.00, 20.00, 30.00, 40.00, 50.00])
expensive = prices > 25.00

print(f"{expensive = }")
print(f"{type(expensive) = }")
print(f"{expensive.dtype.name = }")
print(f"{expensive.dtype.itemsize = }")

sys.exit(0)
expensive = array([False, False,  True,  True,  True])
type(expensive) = <class 'numpy.ndarray'>
expensive.dtype.name = 'bool'
expensive.dtype.itemsize = 1

Vectorized operations are faster than for loops.

import sys
import timeit

n = 10_000_000 #how many prices
repeat = 4     #Do the experiment 4 times.
number = 1     #Each experiment consists of 1 execution of the code.

def fastest(code, setup):
    "Time the code in each experiment, print all the timings, and return the smallest one."
    seconds = timeit.repeat(code, setup = setup, repeat = repeat, number = number)
    print(f"{seconds} seconds")
    quickest = min(seconds)
    print(f"{quickest} seconds is the minimum.")
    print()
    return quickest

#Inflate the prices in an np.ndarray with one vectorized operation.
arraySetup = f"""\
import numpy as np
prices = np.random.uniform(size = {n}, low = 0.00, high = 100.00)
prices = np.around(prices, 2)"""   #Round to closest cent.

minArray = fastest("prices *= 1.06", arraySetup)

#Inflate the prices in a plain old Python list with a for loop.
listSetup = f"""\
import random
prices = [round(random.uniform(0.00, 100.00), 2) for _ in range({n})]"""

minList = fastest("for i in range(len(prices)): prices[i] *= 1.06", listSetup)

print(f"The np.ndarray is {minList / minArray} times faster than the list.")
sys.exit(0)
[0.012434812999999822, 0.005454579000000015, 0.005510028999999861, 0.005391932999999849] seconds
0.005391932999999849 seconds is the minimum.

[0.8033366490000002, 0.7729674679999992, 0.7890182360000004, 0.7625401109999999] seconds
0.7625401109999999 seconds is the minimum.

The np.ndarray is 141.42240102019466 times faster than the list.

Things to try

  1. Create a two-dimensional ndarray.
    import sys
    import functools
    import operator
    import numpy as np
    
    data = [               #a list of lists
        [ 0,  1,  2,  3],
        [10, 11, 12, 13],
        [20, 21, 22, 23]
    ]
    
    nd = np.array(data)
    print(f"{nd = }")
    print()
    
    for row in nd:
        for i in row:
            print(f"{i:2}  ", end = "")
        print()
    print()
    
    print(f"{len(nd) = }")       #number of rows
    print(f"{nd.shape = }")      #a tuple
    print(f"{len(nd.shape) = }") #number of dimensions
    print(f"{nd.ndim = }")       #simpler way to get the number of dimensions
    print()
    
    product = 1
    for i in nd.shape:
        product *= i
    print(f"total number of elements = {product}")
    
    #Simpler way to get the total number of elements.
    product = functools.reduce(operator.mul, nd.shape, 1)
    print(f"total number of elements = {product}")
    
    #Even simpler way to get the total number of elements.
    print(f"total number of elements = {np.prod(nd.shape)}")
    
    #Simplest way to get the total number of elements.
    print(f"total number of elements = {nd.size}")
    print()
    
    print(f"{nd.itemsize = }")
    print(f"{nd.nbytes = }")
    print(f"{sys.getsizeof(nd) = }")
    sys.exit(0)
    

    nd contains 3 × 4 × 8 = 96 bytes of data, plus 112 bytes of overhead, for a total of 208 bytes.

    nd = array([[ 0,  1,  2,  3],
           [10, 11, 12, 13],
           [20, 21, 22, 23]])
    
     0   1   2   3
    10  11  12  13
    20  21  22  23
    
    len(nd) = 3
    nd.shape = (3, 4)
    len(nd.shape) = 2
    nd.ndim = 2
    
    total number of elements = 12
    total number of elements = 12
    total number of elements = 12
    total number of elements = 12
    
    nd.itemsize = 8
    nd.nbytes = 96
    sys.getsizeof(nd) = 208
    
  2. Can you make a three-dimensional np.ndarray? You’ll have to add another for loop. Which way would you prefer to type the data?
    data = [
        [
            [  0,   1,   2,   3],
            [ 10,  11,  12,  13],
            [ 20,  21,  22,  23]
        ],
        [
            [100, 101, 102, 103],
            [110, 111, 112, 113],
            [120, 121, 122, 123]
        ],
    ]
    
    data = [
        [[  0,   1,   2,   3], [ 10,  11,  12,  13], [ 20,  21,  22,  23]],
        [[100, 101, 102, 103], [110, 111, 112, 113], [120, 121, 122, 123]],
    ]