NumPy cookbook

NumPy cookbook

NumPy is the fundamental package underlying many scientific computing libraries in Python; it provides efficient multi-dimensional array operations.

ref:
http://www.numpy.org/

create an n-dimensional array

All elements of a NumPy array must share the same data type.

import numpy as np

# a 2-dimensional array (also known as a matrix) of size 2 x 4
matrix = np.array([
    # columns
    # 0    1    2    3
    [7.0, 8.0, 6.0, 5.0],  # 0
    [4.0, 2.0, 1.0, 9.0],  # 1
                           # rows
])

# get the dimensions as a tuple (rows, columns, ...)
matrix.shape
# output:
# (2, 4)

# create an array of the given shape and type, filled with 0
# (use the builtin `int`: the `np.int` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError)
np.zeros((100,), dtype=int)

# generate an array of 100 evenly spaced numbers from 0 to 10 (both ends included)
x = np.linspace(0, 10, 100)

# a 4 x 4 array of random integers drawn from [0, 5)
m = np.random.randint(5, size=(4, 4))

create an array inside a for-loop

# Build a 0/1 matrix with one row per user and one column per entry of
# all_repo_array: a cell is 1 when that user starred that repo.
column_count = all_repo_array.shape[0]

# Collect the rows in a list and build the matrix once at the end.
# Calling np.append() inside the loop copies every existing row on each
# iteration, which is quadratic in the number of users.
user_rows = []
for username in all_user_array:
    user_starred_repos = RepoStarring.objects \
        .filter(from_username=username) \
        .values_list('repo_full_name', flat=True) \
        .iterator()
    user_starred_repo_array = np.fromiter(user_starred_repos, np.dtype('U140'))
    # np.isin() is the replacement for np.in1d(), which is deprecated.
    # assume_unique=True presumes repo names are unique on both sides — TODO confirm
    user_row = np.isin(all_repo_array, user_starred_repo_array, assume_unique=True)
    user_rows.append(user_row)

# The explicit reshape keeps the (0, column_count) shape when there are no users.
pre_matrix = np.array(user_rows, dtype=np.int8).reshape(len(user_rows), column_count)

ref:
http://akuederle.com/create-numpy-array-with-for-loop

create an array from a file

# genfromtxt's default dtype is float, so non-numeric values are converted to nan (not a number)
# to avoid nan, read every value as U75 (a unicode string of at most 75 characters)
# skip_header=1 skips the first line of the file; delimiter=',' splits the fields on commas
matrix = np.genfromtxt('train.csv', dtype='U75', skip_header=1, delimiter=',')

create an array from a Django queryset

# you must specify the length of the values ("U" means unicode),
# otherwise an exception is raised: Must specify length when using variable-size data-type
# NOTE(review): 'U140' presumes no repo_full_name is longer than 140 characters — verify against the model
repos = RepoStarring.objects.all().values_list('repo_full_name', flat=True).distinct().iterator()
repo_array = np.fromiter(repos, np.dtype('U140'))

ref:
http://stackoverflow.com/questions/1741107/how-do-i-convert-a-django-queryset-to-numpy-record-array

convert a numpy matrix to a sparse matrix

from scipy import sparse

# a dense 4 x 6 array of random integers drawn from [0, 5)
m = np.random.randint(0, 5, size=(4, 6))
# the same values stored in Compressed Sparse Row (CSR) format
sm = sparse.csr_matrix(m)

select

# NOTE: `matrix` is presumably the 2 x 4 array created at the top of this cookbook
# (the outputs shown below match it)

# select the first row and all columns
matrix[0]
# is equivalent to
matrix[0, :]
# output:
# array([ 7.,  8.,  6.,  5.])

# select the second row and all columns
matrix[1]
# output:
# array([ 4.,  2.,  1.,  9.])

# select all rows and the second column
matrix[:, 1]
# output:
# array([ 8.,  2.])

# matrix[row, column]: chained indexing and tuple indexing reach the same element
matrix[0][0] == matrix[0, 0] == 7.0

# select all rows and the columns with index 1 and 2 (the slice end, 3, is excluded)
matrix[:, 1:3]
# output:
# array([[ 8.,  6.],
#        [ 2.,  1.]])

compare

NumPy can perform element-wise comparisons across an entire array.

y = np.array([4, 9, 6, 3, 1])

# an element-wise comparison produces a boolean mask of the same length
less5 = np.less(y, 5)
# output:
# array([True, False, False,  True,  True], dtype=bool)

# indexing with a boolean mask keeps only the positions that are True
y[less5]
# output:
# array([4, 3, 1])

np_positions = np.array(['GK', 'M', 'A', 'D', 'D', 'M'])
np_heights = np.array([191, 184, 185, 183, 179, 179])

# True where the player is a goalkeeper
is_goalkeeper = np_positions == 'GK'

# extract the heights of the goalkeepers
gk_heights = np_heights[is_goalkeeper]
# output:
# array([191])

# extract the heights of all the other players (the negated mask)
other_heights = np_heights[~is_goalkeeper]
# output:
# array([184, 185, 183, 179, 179])

# NOTE(review): world_alcohol is not defined in this cookbook; judging by the
# outputs below it is a 2-D array of strings with the year in column 0 and
# the country in column 2 — confirm against the data source.
year_column = world_alcohol[:, 0]
country_column = world_alcohol[:, 2]

# keep the rows whose country is Canada
countries_canada = world_alcohol[country_column == 'Canada']
# output:
# array([['1984', 'Americas', 'Canada', 'Spirits', '3.35'],
#        ['1989', 'Americas', 'Canada', 'Wine', '1.27'],
#        ['1984', 'Americas', 'Canada', 'Beer', '5'],
#        ['1985', 'Americas', 'Canada', 'Beer', '4.94'],
#        ...

# keep the rows whose year is 1984
years_1984 = world_alcohol[year_column == '1984']
# output:
# array([['1984', 'Africa', 'Nigeria', 'Other', '6.1'],
#        ['1984', 'Eastern Mediterranean', 'Afghanistan', 'Other', '0'],
#        ['1984', 'Americas', 'Costa Rica', 'Wine', '0.06'],
#        ...

# combine two masks with & (element-wise AND); each comparison needs its own parentheses
is_algeria_and_1986 = (year_column == '1986') & (country_column == 'Algeria')
rows_with_algeria_and_1986 = world_alcohol[is_algeria_and_1986]

select non NaN rows

# keep only the rows whose 'similarity' value is not NaN
# (`~` negates a boolean mask element-wise; comparing a mask with `== False` is non-idiomatic)
df[~np.isnan(df['similarity'])]

test whether each element of a 1-D array is also present in a second array.

all_array = np.array(['vinta/awesome-python', 'vinta/pangu.js', 'django/django', 'kennethreitz/requests'])
my_array = np.array(['vinta/awesome-python', 'django/django'])
# np.isin() is the replacement for np.in1d(), which is deprecated.
# assume_unique=True is valid here because neither array contains duplicates.
mask = np.isin(all_array, my_array, assume_unique=True)
# convert the boolean mask into a 0/1 vector
vector = mask.astype(int)
# output:
# array([1, 0, 1, 0])

ref:
http://stackoverflow.com/questions/7088625/what-is-the-most-efficient-way-to-check-if-a-value-exists-in-a-numpy-array

replace

matrix = np.array([[5, 10, 15],
                   [20, 25, 30],
                   [35, 40, 45]])
# basic slicing returns a view, so writing through it updates `matrix` itself
second_column = matrix[:, 1]
# True for the rows whose second column equals 25
second_column_25 = second_column == 25
second_column[second_column_25] = 100
# output:
# array([[  5,  10,  15],
#        [ 20, 100,  30],
#        [ 35,  40,  45]])

# replace every occurrence of the string 1986 in the first column of world_alcohol with the string 2014
is_1986 = world_alcohol[:, 0] == "1986"
world_alcohol[is_1986, 0] = "2014"

compute

weights = np.array([81.6, 97.0, 95.2])
heights = np.array([1.9, 1.8, 1.7])
# arithmetic is applied element by element: weight / height squared
squared_heights = heights ** 2
bmis = weights / squared_heights
# output:
# array([22.60387812, 29.9382716, 32.94117647])

# keep only the BMI values that are at most 30
is_light = bmis <= 30
light_bmi = bmis[is_light]
# output:
# array([22.60387812, 29.9382716])

# the integers 1..6 laid out as 3 rows x 2 columns
mat = np.arange(1, 7).reshape((3, 2))
# output:
# array([[1, 2],
#        [3, 4],
#        [5, 6]])

# the 2-element array is broadcast across every row of mat
np.multiply(mat, np.array([10, 100]))
# output:
# array([[ 10, 200],
#        [ 30, 400],
#        [ 50, 600]])

# type coercion: booleans mixed with integers are stored as integers (True -> 1, False -> 0)
np.add(np.array([True, 1, 2]), np.array([3, 4, False]))
# output:
# array([4, 5, 2])

# type conversion: casting floats to int drops the fractional part
arr = np.array([0.8, 0.4, 2.5, 5.123]).astype(int)

matrix = np.array([[5, 10, 15],
                   [20, 25, 30],
                   [35, 40, 45]])

# axis=1 collapses the columns: one sum per row
np.sum(matrix, axis=1)
# output:
# array([ 30,  75, 120])

# axis=0 collapses the rows: one sum per column
np.sum(matrix, axis=0)
# output:
# array([60, 75, 90])

dot product 內積

# for 1-D arrays it is the dot (inner) product of the two arrays
a = np.array([1, 3, -5])
b = np.array([4, -2, -1])
np.dot(a, b)
# output:
# 3

# for 2-D arrays it is equivalent to matrix multiplication
# each row of matrix a is a list of coefficients; the rows of matrix b are the vectors being combined
#  1 * [3, 1] + 0 * [2, 1] + 2 * [1, 0]
# -1 * [3, 1] + 3 * [2, 1] + 1 * [1, 0]
a = np.array([
    [1, 0, 2],
    [-1, 3, 1],
])
b = np.array([
    [3, 1],
    [2, 1],
    [1, 0],
])
np.dot(a, b)
# output:
# array([[5, 1],
#        [4, 2]])

ref:
https://zh.wikipedia.org/wiki/%E7%9F%A9%E9%99%A3%E4%B9%98%E6%B3%95

sparsity of a matrix

# NOTE(review): this is the fraction of NON-zero cells (often called "density"),
# reported here under the name "sparsity" — confirm which one is intended.
nonzero_row_indices = ratings.nonzero()[0]  # one entry per non-zero cell
cell_count = ratings.shape[0] * ratings.shape[1]
sparsity = float(len(nonzero_row_indices)) / cell_count
print('Sparsity: {0:.2f}%'.format(sparsity * 100))
# Sparsity: 6.30%

ref:
http://stackoverflow.com/questions/38708621/how-to-calculate-percentage-of-sparsity-for-a-numpy-array-matrix

statistics

# 5000 random samples each, drawn from a normal distribution and rounded to 2 decimals
height = np.random.normal(loc=1.76, scale=0.20, size=5000).round(2)
weight = np.random.normal(loc=60.32, scale=15, size=5000).round(2)
# pair the two 1-D arrays up as the columns of a 5000 x 2 array
city = np.column_stack([height, weight])
# output:
# array([[  1.48,  60.49],
#        [  1.48,  51.5 ],
#        [  1.73,  54.61],
#        ...,
#        [  1.55,  61.13],
#        [  1.3 ,  55.36],
#        [  1.5 ,  72.42]])

# mean
height.mean()

# median
np.median(height)

# correlation coefficient (between the height column and the weight column)
heights_column, weights_column = city[:, 0], city[:, 1]
np.corrcoef(heights_column, weights_column)
# output:
# array([[ 1.        ,  0.02700209],
#        [ 0.02700209,  1.        ]])

# standard deviation
np.std(heights_column)
# 0.19767575712767613

ref:
https://campus.datacamp.com/courses/intro-to-python-for-data-science/chapter-4-numpy?ex=8
https://campus.datacamp.com/courses/intro-to-python-for-data-science/chapter-4-numpy?ex=13