Setup Jupyter and other Machine Learning tools on macOS

Jupyter Notebook is an interactive environment for running code in the browser. It allows you to create interactive documents that contain live code, rich text elements, and visualizations. It is also widely used by data scientists for prototypes and demonstrations.

ref:
http://jupyter.org/

Install

$ brew install freetype gcc libffi libpng openssl pkg-config
$ pip install -U \
  cython \
  numpy \
  scipy \
  graphviz \
  matplotlib \
  bokeh \
  seaborn \
  pydotplus \
  scikit-learn \
  nltk \
  pandas \
  pydataset \
  jupyter

$ pip install https://github.com/ipython-contrib/jupyter_contrib_nbextensions/tarball/master && \
  jupyter contrib nbextension install --user

# start your notebook server
$ jupyter notebook

ref:
https://jupyter.readthedocs.io/en/latest/running.html#running
https://github.com/ipython-contrib/jupyter_contrib_nbextensions

Or you could just download Anaconda and install it.
https://www.continuum.io/downloads#osx

Configuration

In ~/.ipython/profile_default/ipython_config.py:

c = get_config()

# display every standalone expression in a cell, not just the last one
c.InteractiveShell.ast_node_interactivity = 'all'

# render matplotlib figures inline in the notebook
# c.InteractiveShellApp.matplotlib = 'notebook'
c.InteractiveShellApp.matplotlib = 'inline'

Usage

# show image
from IPython.display import Image
Image('iris.png')

# show pdf
from IPython.display import IFrame
IFrame('iris.pdf', width='100%', height=700)

Pandas cookbook

Pandas is a Python library providing easy-to-use data structures and data analysis tools.

ref:
http://pandas.pydata.org/

Read a csv file

A DataFrame is a tabular data structure comprised of rows and columns. You can also think of a DataFrame as a group of Series objects (the columns, keyed by the column names) that share a common index (the row labels).
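
A minimal sketch of that "group of Series" view, with made-up toy data:

import pandas as pd

ages = pd.Series([22, 38, 26], index=['Braund', 'Cumings', 'Heikkinen'])
fares = pd.Series([7.25, 71.28, 7.92], index=['Braund', 'Cumings', 'Heikkinen'])

# two Series sharing the same index become the columns of a DataFrame
df = pd.DataFrame({'Age': ages, 'Fare': fares})

# selecting a column gives one of those Series back
df['Age']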

import pandas as pd

# specify columns you need to import
columns = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
diamonds = pd.read_csv('datasets/diamonds/diamonds.csv', usecols=columns)

# add a custom header row (columns)
df = pd.read_csv('movielens/u.data', sep='\t', names=['UserID', 'ItemId', 'Rating', 'Timestamp'])

df.index
df.columns

# show summary
df.info()

# show statistics summary, numeric columns only
df.describe()

ref:
http://pandas.pydata.org/pandas-docs/stable/10min.html
http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/

Convert multiple lists into a DataFrame

pd.DataFrame({
    'dice': list_1[:50],
    'pearsonr': list_2[:50],
    'cosine_similarity': list_3[:50],
})
# output:
#   cosine_similarity   dice   pearsonr
# 0 u3                  u1     u2
# 1 u3                  u1     u2
# 2 u3                  u1     u2
# 3 u3                  u1     u2

Create a DataFrame from a MySQL table

# replace user, password and host with your own credentials
url = 'mysql://root:password@127.0.0.1:3306/albedo'
sql = """
SELECT from_user_id AS user, repo_id AS item, 1 AS rating
FROM app_repostarring
WHERE stargazers_count >= 10;
"""
user_item_df = pd.read_sql(sql, con=url)

url = 'mysql://root:password@127.0.0.1:3306/albedo'
user_item_df = pd.read_sql_table('app_repostarring', con=url, columns=['from_user_id', 'repo_id'])

Create a DataFrame from a Django queryset

from django.db import connection

query, params = your_django_queryset.query.sql_with_params()
pd.io.sql.read_sql_query(query, connection, params=params)

ref:
https://www.iwoca.co.uk/blog/2016/09/02/using-pandas-django-faster/

Drop columns

# axis=0 means rows
# axis=1 means columns
train = df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

ref:
http://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean

Drop elements of a Series by index

user_starred = df.loc['vinta', :][df.loc['vinta', :] == 1]
user_unstarred = df.loc['vinta', :].drop(user_starred.index)
user_unstarred.sort_values(ascending=False)

Select columns

# select all rows of Fare column
# return a Series
df['Fare']

# select the first 5 rows of repo_description column
df['repo_description'][:5]

# select the row with index label 152 from the Fare column
df['Fare'][152]
# equals to
df.Fare[152]

# select specific columns
df[['Survived', 'Age', 'Sex']]
# equals to
df.loc[:, ['Survived', 'Age', 'Sex']]

Select rows

Explicitly use df.loc[] for label-based indexing and df.iloc[] for positional indexing; don't use df.ix[], which is deprecated.

# return a DataFrame
df.loc['row_a':'row_z']

df.loc[['row_a', 'row_b', 'row_c'], ['column_1', 'column_2']]

# in these cases, the index labels happen to be numbers
# note that unlike iloc, a loc slice includes both endpoints
df.loc[0]
df.loc[0:10, ['repo_description']]

# select the first row, it's a Series object
df.iloc[0]

# you can only use positional indexes for both row and column selections with iloc
# df.iloc[row_selection, column_selection]
df.iloc[0, 1]

# select rows at positions 3, 4, 5 (the end of an iloc slice is exclusive)
df.iloc[3:6]

# select rows at positions 2, 5, 10
df.iloc[[2, 5, 10]]

df.iloc[2]['similarity']

ref:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#different-choices-for-indexing
http://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/

Conditionally select

df[df['Age'] >= 70]
df[(df['Age'] >= 60) & (df['Sex'] == 'female')]
df[(df['Age'] >= 60) | (df['Age'] <= 20)]

df['Sex'].unique()
# output:
# array(['male', 'female'], dtype=object)

# show counts of unique values
df['Embarked'].value_counts()
# output:
# S    646
# C    168
# Q     77

df['Embarked'].value_counts(normalize=True)
# output:
# S    0.725028
# C    0.188552
# Q    0.086420

df['Embarked'].value_counts().max()
# output:
# 646

df['Embarked'].value_counts().idxmax()
# output:
# S

df['Survived'][df['Sex'] == 'male'].value_counts()
df['Survived'][df['Sex'] == 'female'].value_counts(normalize=True)

df['Age'].nlargest(3)
# output:
# 630    80.0
# 851    74.0
# 96     71.0

# select rows whose passenger class is first (1) or second (2)
df[df['Pclass'].isin([1, 2])]

Select only non-NaN rows

df[df['similarity'].notnull()]
# equals to
df[pd.isnull(df['similarity']) == False]

Get distinct values of a column

item_ids = user_item_df['item'].unique()

Rename columns

df = df.rename(columns={'from_user_id': 'user', 'repo_id': 'item'})

Convert a DataFrame into a utility matrix (rating matrix)

matrix_df = user_item_df.pivot(index='user', columns='item', values='rating')
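
Note that pivot() raises a ValueError if any (user, item) pair appears more than once; in that case you can aggregate the duplicates with pivot_table(), e.g.:

matrix_df = user_item_df.pivot_table(index='user', columns='item', values='rating', aggfunc='mean')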

ref:
http://stackoverflow.com/questions/37576594/rearrange-a-pandas-data-frame-to-create-a-2d-ratings-matrix

Convert a utility matrix into a sparse SciPy COO matrix

from scipy.sparse import coo_matrix

temp_df = user_item_df.set_index(['user', 'item'])
star_matrix = coo_matrix((
    user_item_df['rating'], (temp_df.index.labels[0], temp_df.index.labels[1])
))
# <10003x423 sparse matrix of type '<class 'numpy.float64'>'
# with 245579 stored elements in COOrdinate format>
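
MultiIndex.labels was renamed to codes in pandas 0.24, so on a newer pandas the same construction would be:

from scipy.sparse import coo_matrix

temp_df = user_item_df.set_index(['user', 'item'])
star_matrix = coo_matrix((
    user_item_df['rating'], (temp_df.index.codes[0], temp_df.index.codes[1])
))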

Group by

df = pd.read_csv('movielens/u.data', sep='\t', names=['UserID', 'ItemId', 'Rating', 'Timestamp'])

df.groupby(['Rating'])['UserID'].count()
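
agg() lets you compute several aggregations in one pass; a small sketch with the same columns:

# count ratings and distinct users per rating value
df.groupby('Rating')['UserID'].agg(['count', 'nunique'])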

ref:
https://zhuanlan.zhihu.com/p/25184830

Iterate over a DataFrame

for index, row in df.iterrows():
    username = index
    score = row['similarity']
    print('{0} / https://github.com/{1}'.format(score, username))
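
iterrows() is flexible but slow; when the column names are valid identifiers, itertuples() is usually much faster. A sketch against the same hypothetical frame, with usernames in the index and a similarity column:

# each row is a namedtuple; row.Index holds the index value
for row in df.itertuples():
    print('{0} / https://github.com/{1}'.format(row.similarity, row.Index))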

ref:
http://stackoverflow.com/questions/7837722/what-is-the-most-efficient-way-to-loop-through-dataframes-with-pandas

Handle missing values

# keep only rows with at least 4 non-NA values; rows with fewer are dropped
df.dropna(thresh=4)
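
A quick check of the thresh semantics on a hypothetical toy frame:

toy = pd.DataFrame({'a': [1, None], 'b': [None, None], 'c': [3, None], 'd': [4, None]})
toy.dropna(thresh=3)
# only the first row survives: it has 3 non-NA values, the second has none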

# show whether each column contains any missing values
df.isnull().any()

# fill missing values
df['Embarked'] = df['Embarked'].fillna('S')

# replace missing values with median
df['Age'] = df['Age'].fillna(df['Age'].median())

# replace missing values with the most common value
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().idxmax())

Sort a DataFrame by values

# sort by Age, from largest to smallest
sorted_df = df.sort_values(by='Age', ascending=False)

Operate

# convert a column to the memory-efficient category dtype
df['Sex'] = df['Sex'].astype('category')

# group by
df.loc[:, ['Pclass', 'Fare']].groupby('Pclass').sum()
df.loc[:, ['Pclass', 'Survived', 'Fare']].groupby(['Pclass', 'Survived']).sum()

# replace
df.loc[df['Sex'] == 'male', 'Sex'] = 0
df.loc[df['Sex'] == 'female', 'Sex'] = 1
# equals to
df['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
# don't do this: chained assignment triggers SettingWithCopyWarning
# and may silently fail to modify df
df['Sex'][df['Sex'] == 'male'] = 0
df['Sex'][df['Sex'] == 'female'] = 1

# add a column: 1 if the passenger is a child (Age < 18), otherwise 0
df['Child'] = 0
df.loc[df['Age'] < 18, 'Child'] = 1

ref:
http://tomaugspurger.github.io/modern-1.html

Convert a Series into a DataFrame

data = {
    0: 1.0,
    1: 0.15502648795530585,
    2: 0.07641502252040476,
    3: 0.09437745470567728,
    4: 0.026112204840046276,
    5: 0.090005329668190928,
}
sdf = pd.Series(data).to_frame('similarity')

Iterate over a Series

# items() yields (index, value) pairs; it was called iteritems() in older pandas
for index, value in my_non_starred.items():
    print(index, value)

my_non_starred.index

Sort a Series

# argsort() returns the integer positions that would sort the Series
similarities[0].argsort()
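
To get the Series actually sorted by its values, sort_values() is the direct route (same hypothetical similarities object):

similarities[0].sort_values(ascending=False)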

NumPy cookbook

NumPy is the fundamental package underlying many scientific computing libraries in Python; it provides efficient multi-dimensional array operations.

ref:
http://www.numpy.org/

Create an n-dimensional array

All elements of a NumPy array must have the same data type.
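
A quick illustration of that rule: when you mix types, NumPy upcasts everything to a common dtype.

import numpy as np

np.array([1, 2, 3]).dtype    # an integer dtype such as int64 (platform-dependent)
np.array([1, 2.5, 3]).dtype  # float64: the ints are upcast to floats
np.array([1, 'a']).dtype     # a unicode string dtype: every element becomes a string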

import numpy as np

# a 2-dimensional array (also known as a matrix) of size 2 x 4
matrix = np.array([
    # columns
    # 0    1    2    3
    [7.0, 8.0, 6.0, 5.0],  # 0
    [4.0, 2.0, 1.0, 9.0],  # 1
                           # rows
])

# get dimensions (x, y, z, ...)
matrix.shape
# output:
# (2, 4)

# create an array of given shape and type, filled with 0
# (use the builtin int; np.int was deprecated and later removed from NumPy)
np.zeros((100,), dtype=int)

# generate an array of 100 evenly spaced numbers from 0 to 10
x = np.linspace(0, 10, 100)

# a 4 x 4 array of random integers drawn from [0, 5)
m = np.random.randint(5, size=(4, 4))

Create an array inside a for-loop

column_count = all_repo_array.shape[0]
pre_matrix = np.empty((0, column_count), dtype=np.int8)
for username in all_user_array:
    user_starred_repos = RepoStarring.objects \
        .filter(from_username=username) \
        .values_list('repo_full_name', flat=True) \
        .iterator()
    user_starred_repo_array = np.fromiter(user_starred_repos, np.dtype('U140'))
    user_row = np.in1d(all_repo_array, user_starred_repo_array, assume_unique=True)
    pre_matrix = np.append(pre_matrix, [user_row], axis=0)
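
np.append copies the whole matrix on every iteration, which makes the loop O(n^2); collecting the rows in a list and stacking once at the end is usually faster (a sketch assuming the same models and arrays as above):

rows = []
for username in all_user_array:
    user_starred_repos = RepoStarring.objects \
        .filter(from_username=username) \
        .values_list('repo_full_name', flat=True) \
        .iterator()
    user_starred_repo_array = np.fromiter(user_starred_repos, np.dtype('U140'))
    rows.append(np.in1d(all_repo_array, user_starred_repo_array, assume_unique=True))
pre_matrix = np.vstack(rows).astype(np.int8)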

ref:
http://akuederle.com/create-numpy-array-with-for-loop

Create an array from a file

# genfromtxt's default dtype is float, which converts non-numeric values to nan (not a number)
# to avoid nan, read values as U75 (unicode strings of up to 75 characters)
matrix = np.genfromtxt('train.csv', dtype='U75', skip_header=1, delimiter=',')

Create an array from a Django queryset

# you must specify the length of the values ("U" means unicode)
# or an exception will be raised: Must specify length when using variable-size data-type
repos = RepoStarring.objects.all().values_list('repo_full_name', flat=True).distinct().iterator()
repo_array = np.fromiter(repos, np.dtype('U140'))

ref:
http://stackoverflow.com/questions/1741107/how-do-i-convert-a-django-queryset-to-numpy-record-array

Convert a NumPy matrix to a sparse matrix

from scipy import sparse

m = np.random.randint(5, size=(4, 6))
sm = sparse.csr_matrix(m)

Select

matrix[0]
# equals to
matrix[0, :]
# output:
# array([ 7.,  8.,  6.,  5.])

# select the second row and all columns
matrix[1]
# output:
# array([ 4.,  2.,  1.,  9.])

# select all rows and the second column
matrix[:, 1]
# output:
# array([ 8.,  2.])

# matrix[row, column]
matrix[0][0] == matrix[0, 0] == 7.0

# select all rows and the columns at index 1 and 2 (the slice end is exclusive)
matrix[:, 1:3]
# output:
# array([[ 8.,  6.],
#        [ 2.,  1.]])

Compare

NumPy can perform comparisons across an entire array; the result is an element-wise boolean array.

y = np.array([4, 9, 6, 3, 1])

less5 = y < 5
# output:
# array([True, False, False,  True,  True], dtype=bool)

y[less5]
# output:
# array([4, 3, 1])

np_positions = np.array(['GK', 'M', 'A', 'D', 'D', 'M'])
np_heights = np.array([191, 184, 185, 183, 179, 179])

# extract all the heights of the goalkeepers
gk_heights = np_heights[np_positions == 'GK']
# output:
# array([191])

# extract the heights of all the other players
other_heights = np_heights[np_positions != 'GK']
# output:
# array([184, 185, 183, 179, 179])

countries_canada = world_alcohol[world_alcohol[:, 2] == 'Canada']
# output:
# array([['1984', 'Americas', 'Canada', 'Spirits', '3.35'],
#        ['1989', 'Americas', 'Canada', 'Wine', '1.27'],
#        ['1984', 'Americas', 'Canada', 'Beer', '5'],
#        ['1985', 'Americas', 'Canada', 'Beer', '4.94'],
#        ...

years_1984 = world_alcohol[world_alcohol[:, 0] == '1984']
# output:
# array([['1984', 'Africa', 'Nigeria', 'Other', '6.1'],
#        ['1984', 'Eastern Mediterranean', 'Afghanistan', 'Other', '0'],
#        ['1984', 'Americas', 'Costa Rica', 'Wine', '0.06'],
#        ...

is_algeria_and_1986 = (world_alcohol[:, 0] == '1986') & (world_alcohol[:, 2] == 'Algeria')
rows_with_algeria_and_1986 = world_alcohol[is_algeria_and_1986, :]

Select non-NaN rows

df[~np.isnan(df['similarity'])]
# equals to
df[np.isnan(df['similarity']) == False]

Test whether each element of a 1-D array is also present in a second array

all_array = np.array(['vinta/awesome-python', 'vinta/pangu.js', 'django/django', 'kennethreitz/requests'])
my_array = np.array(['vinta/awesome-python', 'django/django'])
mask = np.in1d(all_array, my_array, assume_unique=True)
vector = mask.astype(int)
# output:
# array([1, 0, 1, 0])
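
NumPy 1.13 added np.isin as the more general successor to np.in1d; for 1-D input the result is the same:

mask = np.isin(all_array, my_array, assume_unique=True)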

ref:
http://stackoverflow.com/questions/7088625/what-is-the-most-efficient-way-to-check-if-a-value-exists-in-a-numpy-array

Replace

matrix = np.array([
    [5, 10, 15], 
    [20, 25, 30],
    [35, 40, 45],
])
second_column_25 = matrix[:, 1] == 25
matrix[second_column_25, 1] = 100
# output:
# array([[  5,  10,  15],
#        [ 20, 100,  30],
#        [ 35,  40,  45]])

# replace all instances of the string 1986 in the first column of world_alcohol with the string 2014
world_alcohol[world_alcohol[:, 0] == "1986", 0] = "2014"

Compute

weights = np.array([81.6,  97.0,  95.2])
heights = np.array([1.9,  1.8,  1.7])
bmis = weights / (heights ** 2)
# output:
# array([22.60387812, 29.9382716, 32.94117647])

light_bmi = bmis[bmis <= 30]
# output:
# array([22.60387812, 29.9382716])

mat = np.arange(start=1, stop=7).reshape(3, 2)
# output:
# array([[1, 2],
#        [3, 4],
#        [5, 6]])

mat * np.array([10, 100])
# output:
# array([[ 10, 200],
#        [ 30, 400],
#        [ 50, 600]])

# type coercion
np.array([True, 1, 2]) + np.array([3, 4, False])
# output:
# array([4, 5, 2])

# type conversion (astype truncates floats toward zero)
arr = np.array([0.8, 0.4, 2.5, 5.123])
arr = arr.astype(int)

matrix = np.array([
    [5, 10, 15], 
    [20, 25, 30],
    [35, 40, 45],
])

# perform the operation on each row
matrix.sum(axis=1)
# output:
# array([ 30,  75, 120])

# perform the operation on each column
matrix.sum(axis=0)
# output:
# array([60, 75, 90])

Dot product (inner product)

# for 1-D array it is dot product of two arrays
a = np.array([1, 3, -5])
b = np.array([4, -2, -1])
a.dot(b)
# output:
# 3

# for 2-D arrays it is equivalent to matrix multiplication
# each row of a holds the coefficients, each row of b is a vector:
# row 0: 1*[3, 1] + 0*[2, 1] + 2*[1, 0]
# row 1: -1*[3, 1] + 3*[2, 1] + 1*[1, 0]
a = np.array((
    (1, 0, 2),
    (-1, 3, 1)
))
b = np.array((
    (3, 1),
    (2, 1),
    (1, 0)
))
a.dot(b)
# output:
# array([[5, 1],
#        [4, 2]])

ref:
https://zh.wikipedia.org/wiki/%E7%9F%A9%E9%99%A3%E4%B9%98%E6%B3%95

Sparsity of a matrix

# this computes the fraction of non-zero entries (strictly the density;
# sparsity is conventionally the fraction of zero entries)
sparsity = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1])
print('Sparsity: {0:.2f}%'.format(sparsity * 100))
# Sparsity: 6.30%
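
An equivalent one-liner, assuming ratings is a dense NumPy array:

sparsity = np.count_nonzero(ratings) / ratings.size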

ref:
http://stackoverflow.com/questions/38708621/how-to-calculate-percentage-of-sparsity-for-a-numpy-array-matrix

Statistics

height = np.round(np.random.normal(1.76, 0.20, 5000), 2)
weight = np.round(np.random.normal(60.32, 15, 5000), 2)
city = np.column_stack((height, weight))
# output:
# array([[  1.48,  60.49],
#        [  1.48,  51.5 ],
#        [  1.73,  54.61],
#        ..., 
#        [  1.55,  61.13],
#        [  1.3 ,  55.36],
#        [  1.5 ,  72.42]])

# mean
np.mean(height)

# median
np.median(height)

# correlation coefficient
np.corrcoef(city[:, 0], city[:, 1])
# output:
# array([[ 1.        ,  0.02700209],
#        [ 0.02700209,  1.        ]])

# standard deviation
np.std(city[:, 0])
# 0.19767575712767613

ref:
https://campus.datacamp.com/courses/intro-to-python-for-data-science/chapter-4-numpy?ex=8
https://campus.datacamp.com/courses/intro-to-python-for-data-science/chapter-4-numpy?ex=13