matplotlib cookbook

matplotlib cookbook

Matplotlib is the most popular 2D plotting library in Python. Seaborn is built on top of matplotlib and especially aims to exploratory analysis.

ref:
http://matplotlib.org/
http://seaborn.pydata.org/

My notebook
https://github.com/vinta/machine-learning-notebooks/blob/master/matplotlib_cookbook.ipynb

matplotlib

import matplotlib
import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')

ref:
http://www.scipy-lectures.org/intro/matplotlib/index.html
https://tonysyu.github.io/raw_content/matplotlib-style-gallery/gallery.html

matplotlib has two interface styles:

# matlab / pylab / pyplot style
plt.plot(x, np.sin(x))

# object-oriented style
fig, ax = plt.subplots()
ax.plot(x, np.sin(x))

Change Line Colors and Styles

plt.plot(x, np.sin(x - 0), color='blue')        # specify color by name
plt.plot(x, np.sin(x - 1), color='g')           # short color code (works for rgb & cmyk)
plt.plot(x, np.sin(x - 2), color='0.75')        # Greyscale between 0 and 1
plt.plot(x, np.sin(x - 3), color='#FFDD44')     # Hex color code (RRGGBB from 00 to FF)
plt.plot(x, np.sin(x - 4), color=(1.0,0.2,0.3)) # RGB tuple, values between 0 and 1
plt.plot(x, np.sin(x - 5), color='chartreuse'); # all html color names are supported;

plt.plot(x, x + 4, linestyle='-')  # solid
plt.plot(x, x + 5, linestyle='--') # dashed
plt.plot(x, x + 6, linestyle='-.') # dashdot
plt.plot(x, x + 7, linestyle=':')  # dotted;

plt.plot(x, x + 0, '-g')  # solid green
plt.plot(x, x + 1, '--c') # dashed cyan
plt.plot(x, x + 2, '-.k') # dashdot black
plt.plot(x, x + 3, ':r')  # dotted red;
plt.plot(x, x + 4, 'o', color='black');

Change Labels

plt.plot(x, np.sin(x))
plt.title("A Sine Curve")
plt.xlabel("x")
plt.ylabel("sin(x)")

plt.plot(x, np.sin(x), '-g', label='sin(x)')
plt.plot(x, np.cos(x), ':b', label='cos(x)')
plt.legend();

Integrate with Seaborn

from pydataset import data
import pandas as pd
import seaborn as sns

columns = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
diamonds = pd.read_csv('datasets/diamonds/diamonds.csv', usecols=columns)

sns.countplot(x='cut', data=diamonds)

sns.barplot(x='cut', y='price', data=diamonds)

sns.jointplot(x='carat', y='price', data=diamonds, size=8, alpha=0.25, marker='.')

sns.pairplot(diamonds, hue='cut')

g = sns.FacetGrid(diamonds, col='color', hue='color', col_wrap=4)
g.map(sns.regplot, 'carat', 'price')

ref:
http://seaborn.pydata.org/tutorial.html
http://seaborn.pydata.org/tutorial/axis_grids.html

Pandas cookbook

Pandas cookbook

Pandas is a Python library providing easy-to-ues data structure and data analysis tools.

ref:
http://pandas.pydata.org/
https://github.com/vinta/machine-learning-notebooks/blob/master/pandas_cookbook.ipynb

create

A DataFrame is a tablular data structure comprised of rows and columns. You can also think of a DataFrame as a group of Series objects that share an index (the column names).

import pandas as pd

series = pd.Series(
    [7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
    index=['A', 'Z', 'C', 'Y', 'E'])

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# specify columns you need to import
columns = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
diamonds = pd.read_csv('datasets/diamonds/diamonds.csv', usecols=columns)

train.index
train.columns

# show summary
train.info()

# show statistics summary, numeric columns only
train.describe()

ref:
http://pandas.pydata.org/pandas-docs/stable/10min.html
http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/

drop

# axis=0 means rows
# axis=1 means columns
train = train.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

ref:
http://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean

select

Explicitly use df.loc[] for label-based indexing and use df.iloc[] for positional indexing, don't use df.ix[].

ref:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#different-choices-for-indexing

df.loc[['A', 'B', 'C'], ['name', 'type']]

df.loc['A':'Z']

# in this case, labeled indexes are numbers
train.loc[0]

# select the first row, it's a Series object
train.iloc[0]

# select rows at index 3, 4, 5, 6
train.iloc[3:6]

# select rows at index 2, 5, 10
train.iloc[[2, 5, 10]]

# select all rows of Fare column
test['Fare']

# select the 152th row of Fare column
test['Fare'][152]
# equals to
test.Fare[152]

# select specific columns
train[['Survived', 'Age', 'Sex']]
# equals to
train.loc[:, ['Survived', 'Age', 'Sex']]

conditionally select

train[train['Age'] >= 70]
train[(train['Age'] >= 60) & (train['Sex'] == 'female')]
train[(train['Age'] >= 60) | (train['Age'] <= 20)]

train['Sex'].unique()
# output:
# array(['male', 'female'], dtype=object)

# show counts of unique values
train['Embarked'].value_counts()
# output:
# S    646
# C    168
# Q     77

train['Embarked'].value_counts(normalize=True)
# output:
# S    0.725028
# C    0.188552
# Q    0.086420

train['Embarked'].value_counts().max()
# output:
# 646

train['Embarked'].value_counts().idxmax()
# output:
# S

train['Survived'][train['Sex'] == 'male'].value_counts()
train['Survived'][train['Sex'] == 'female'].value_counts(normalize=True)

train['Age'].nlargest(3)
# output:
# 630    80.0
# 851    74.0
# 96     71.0

# select rows that passenger class is first class (1) or second class (2)
train[train['Pclass'].isin([1, 2])]

missing value

# show that whether columns contain missing value
train.isnull().any()

# fill missing values
train['Embarked'] = train['Embarked'].fillna('S')

# replace missing values with median
train['Age'] = train['Age'].fillna(train['Age'].median())

# replace missing values with the most common value
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().idxmax())

operate

train['Sex'] = train['Sex'].astype('category')

# sort by age, from the largest to the smallest
train.sort_values(by='Age', ascending=False)

# group by
train.loc[:, ['Pclass', 'Fare']].groupby('Pclass').sum()
train.loc[:, ['Pclass', 'Survived', 'Fare']].groupby(('Pclass', 'Survived')).sum()

# replace
train.loc[train['Sex'] == 'male', 'Sex'] = 0
train.loc[train['Sex'] == 'female', 'Sex'] = 1
# equals to
train['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
# dont't do this
train['Sex'][train['Sex'] == 'male'] = 0
train['Sex'][train['Sex'] == 'female'] = 1

# add a column
train['Child'] = 0
train.loc[train['Age'] < 18, 'Child'] = 0
train.loc[train['Age'] >= 18, 'Child'] = 1

ref:
http://tomaugspurger.github.io/modern-1.html