All Posts Tagged “data visualization”

matplotlib cookbook

Matplotlib is the most popular 2D plotting library in Python. Seaborn is built on top of matplotlib and is aimed especially at exploratory analysis.

ref:
http://matplotlib.org/
http://seaborn.pydata.org/

My notebook
https://github.com/vinta/machine-learning-notebooks/blob/master/matplotlib_cookbook.ipynb

matplotlib

import matplotlib
import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')

ref:
http://www.scipy-lectures.org/intro/matplotlib/index.html
https://tonysyu.github.io/raw_content/matplotlib-style-gallery/gallery.html

matplotlib has two interface styles:

# matlab / pylab / pyplot style
plt.plot(x, np.sin(x))

# object-oriented style
fig, ax = plt.subplots()
ax.plot(x, np.sin(x))

Change Line Colors and Styles

plt.plot(x, np.sin(x - 0), color='blue')        # specify color by name
plt.plot(x, np.sin(x - 1), color='g')           # short color code (works for rgb & cmyk)
plt.plot(x, np.sin(x - 2), color='0.75')        # Greyscale between 0 and 1
plt.plot(x, np.sin(x - 3), color='#FFDD44')     # Hex color code (RRGGBB from 00 to FF)
plt.plot(x, np.sin(x - 4), color=(1.0,0.2,0.3)) # RGB tuple, values between 0 and 1
plt.plot(x, np.sin(x - 5), color='chartreuse'); # all html color names are supported;

plt.plot(x, x + 4, linestyle='-')  # solid
plt.plot(x, x + 5, linestyle='--') # dashed
plt.plot(x, x + 6, linestyle='-.') # dashdot
plt.plot(x, x + 7, linestyle=':')  # dotted;

plt.plot(x, x + 0, '-g')  # solid green
plt.plot(x, x + 1, '--c') # dashed cyan
plt.plot(x, x + 2, '-.k') # dashdot black
plt.plot(x, x + 3, ':r')  # dotted red;
plt.plot(x, x + 4, 'o', color='black');

Change Labels

plt.plot(x, np.sin(x))
plt.title("A Sine Curve")
plt.xlabel("x")
plt.ylabel("sin(x)")

plt.plot(x, np.sin(x), '-g', label='sin(x)')
plt.plot(x, np.cos(x), ':b', label='cos(x)')
plt.legend();

Integrate with Seaborn

from pydataset import data
import pandas as pd
import seaborn as sns

columns = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
diamonds = pd.read_csv('datasets/diamonds/diamonds.csv', usecols=columns)

sns.countplot(x='cut', data=diamonds)

sns.barplot(x='cut', y='price', data=diamonds)

sns.jointplot(x='carat', y='price', data=diamonds, size=8, alpha=0.25, marker='.')

sns.pairplot(diamonds, hue='cut')

g = sns.FacetGrid(diamonds, col='color', hue='color', col_wrap=4)
g.map(sns.regplot, 'carat', 'price')

ref:
http://seaborn.pydata.org/tutorial.html
http://seaborn.pydata.org/tutorial/axis_grids.html

Pandas cookbook

Pandas is a Python library providing easy-to-use data structures and data analysis tools.

ref:
http://pandas.pydata.org/
https://github.com/vinta/machine-learning-notebooks/blob/master/pandas_cookbook.ipynb

create

A DataFrame is a tabular data structure comprised of rows and columns. You can also think of a DataFrame as a group of Series objects that share an index (the column names).

import pandas as pd

series = pd.Series(
    [7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
    index=['A', 'Z', 'C', 'Y', 'E'])

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

# specify columns you need to import
columns = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']
diamonds = pd.read_csv('datasets/diamonds/diamonds.csv', usecols=columns)

train.index
train.columns

# show summary
train.info()

# show statistics summary, numeric columns only
train.describe()

ref:
http://pandas.pydata.org/pandas-docs/stable/10min.html
http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/

drop

# axis=0 means rows
# axis=1 means columns
train = train.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

ref:
http://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean

select

Explicitly use df.loc[] for label-based indexing and df.iloc[] for positional indexing. Don't use df.ix[]: it is deprecated and has been removed from recent pandas versions.

ref:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#different-choices-for-indexing

df.loc[['A', 'B', 'C'], ['name', 'type']]

df.loc['A':'Z']

# in this case, labeled indexes are numbers
train.loc[0]

# select the first row, it's a Series object
train.iloc[0]

# select rows at positions 3, 4, 5 (the end of an iloc slice is exclusive)
train.iloc[3:6]

# select rows at index 2, 5, 10
train.iloc[[2, 5, 10]]

# select all rows of Fare column
test['Fare']

# select the value of the Fare column at index label 152
test['Fare'][152]
# equals to
test.Fare[152]

# select specific columns
train[['Survived', 'Age', 'Sex']]
# equals to
train.loc[:, ['Survived', 'Age', 'Sex']]

conditionally select

train[train['Age'] >= 70]
train[(train['Age'] >= 60) & (train['Sex'] == 'female')]
train[(train['Age'] >= 60) | (train['Age'] <= 20)]

train['Sex'].unique()
# output:
# array(['male', 'female'], dtype=object)

# show counts of unique values
train['Embarked'].value_counts()
# output:
# S    646
# C    168
# Q     77

train['Embarked'].value_counts(normalize=True)
# output:
# S    0.725028
# C    0.188552
# Q    0.086420

train['Embarked'].value_counts().max()
# output:
# 646

train['Embarked'].value_counts().idxmax()
# output:
# S

train['Survived'][train['Sex'] == 'male'].value_counts()
train['Survived'][train['Sex'] == 'female'].value_counts(normalize=True)

train['Age'].nlargest(3)
# output:
# 630    80.0
# 851    74.0
# 96     71.0

# select rows where the passenger class is first class (1) or second class (2)
train[train['Pclass'].isin([1, 2])]

missing value

# show whether each column contains any missing values
train.isnull().any()

# fill missing values
train['Embarked'] = train['Embarked'].fillna('S')

# replace missing values with median
train['Age'] = train['Age'].fillna(train['Age'].median())

# replace missing values with the most common value
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].value_counts().idxmax())

operate

train['Sex'] = train['Sex'].astype('category')

# sort by age, from the largest to the smallest
train.sort_values(by='Age', ascending=False)

# group by
train.loc[:, ['Pclass', 'Fare']].groupby('Pclass').sum()
train.loc[:, ['Pclass', 'Survived', 'Fare']].groupby(['Pclass', 'Survived']).sum()

# replace
train.loc[train['Sex'] == 'male', 'Sex'] = 0
train.loc[train['Sex'] == 'female', 'Sex'] = 1
# equals to
train['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
# don't do this: chained assignment may modify a temporary copy instead of train
train['Sex'][train['Sex'] == 'male'] = 0
train['Sex'][train['Sex'] == 'female'] = 1

# add a column
train['Child'] = 0
train.loc[train['Age'] < 18, 'Child'] = 1
train.loc[train['Age'] >= 18, 'Child'] = 0

ref:
http://tomaugspurger.github.io/modern-1.html