NumPy cookbook

NumPy cookbook

NumPy is the fundamental package for many scientific computing libraries in Python; it provides efficient multi-dimensional array operations.

ref:
http://www.numpy.org/

create a n-dimensional array

Each NumPy array can only contain elements of the same data type.

import numpy as np

# a 2-dimensional array (also known as a matrix) of size 2 x 4
matrix = np.array([
    # columns
    # 0    1    2    3
    [7.0, 8.0, 6.0, 5.0],  # 0
    [4.0, 2.0, 1.0, 9.0],  # 1
                           # rows
])

# get the size of each dimension (rows, columns, ...)
matrix.shape
# output:
# (2, 4)

# create an array of given shape and type, filled with 0
# (the np.int alias has been removed from NumPy; the builtin int is the replacement)
np.zeros((100,), dtype=int)

# generate an array that contains 100 evenly spaced numbers from 0 to 10 (both ends included)
x = np.linspace(0, 10, 100)

# a 4 x 4 array of random integers drawn from 0 to 4 (5 is excluded)
m = np.random.randint(5, size=(4, 4))

create an array inside a for-loop

# Build a (number of users) x (number of repos) matrix, one row per user;
# a cell is 1 when the user starred the repo at that column and 0 otherwise.
column_count = all_repo_array.shape[0]

# Collect the rows in a plain list and build the matrix once at the end:
# calling np.append inside the loop copies the whole matrix on every iteration.
user_rows = []
for username in all_user_array:
    user_starred_repos = RepoStarring.objects \
        .filter(from_username=username) \
        .values_list('repo_full_name', flat=True) \
        .iterator()
    user_starred_repo_array = np.fromiter(user_starred_repos, np.dtype('U140'))
    # boolean mask over all_repo_array: True where the repo was starred by this user
    # NOTE(review): assume_unique=True presumes both arrays hold no duplicates; confirm
    user_row = np.in1d(all_repo_array, user_starred_repo_array, assume_unique=True)
    user_rows.append(user_row)

if user_rows:
    pre_matrix = np.array(user_rows, dtype=np.int8)
else:
    # no users at all: keep the empty (0, column_count) matrix
    pre_matrix = np.empty((0, column_count), dtype=np.int8)

ref:
http://akuederle.com/create-numpy-array-with-for-loop

create an array from a file

# genfromtxt's default dtype is float, so it converts non-numeric values to nan (not a number)
# to avoid nan, we read every value as U75 (a unicode string of at most 75 characters)
matrix = np.genfromtxt('train.csv', dtype='U75', skip_header=1, delimiter=',')

create an array from a Django queryset

# you must specify the length of the values ("U" means unicode)
# or an exception will be raised: Must specify length when using variable-size data-type
repos = RepoStarring.objects.all().values_list('repo_full_name', flat=True).distinct().iterator()
repo_array = np.fromiter(repos, np.dtype('U140'))

ref:
http://stackoverflow.com/questions/1741107/how-do-i-convert-a-django-queryset-to-numpy-record-array

convert a numpy matrix to a sparse matrix

from scipy import sparse

m = np.random.randint(5, size=(4, 6))
sm = sparse.csr_matrix(m)

select

# select the first row and all columns
matrix[0]
# is equivalent to
matrix[0, :]
# output:
# array([ 7.,  8.,  6.,  5.])

# select the second row and all columns
matrix[1]
# output:
# array([ 4.,  2.,  1.,  9.])

# select all rows and the second column
matrix[:, 1]
# output:
# array([ 8.,  2.])

# matrix[row, column]
matrix[0][0] == matrix[0, 0] == 7.0

# select all rows and the columns with index 1 and 2 (the stop index 3 is excluded)
matrix[:, 1:3]
# output:
# array([[ 8.,  6.],
#        [ 2.,  1.]])

compare

NumPy can make comparisons across an entire array at once.

y = np.array([4, 9, 6, 3, 1])

# the comparison is applied to every element and yields a boolean array
less5 = y < 5
# output:
# array([True, False, False,  True,  True], dtype=bool)

# indexing with a boolean array keeps only the elements where it is True
y[less5]
# output:
# array([4, 3, 1])

np_positions = np.array(['GK', 'M', 'A', 'D', 'D', 'M'])
np_heights = np.array([191, 184, 185, 183, 179, 179])

# extract all the heights of the goalkeepers
gk_heights = np_heights[np_positions == 'GK']
# output:
# array([191])

# extract the heights of all the other players
other_heights = np_heights[np_positions != 'GK']
# output:
# array([184, 185, 183, 179, 179])

countries_canada = world_alcohol[world_alcohol[:, 2] == 'Canada']
# output:
# array([['1984', 'Americas', 'Canada', 'Spirits', '3.35'],
#        ['1989', 'Americas', 'Canada', 'Wine', '1.27'],
#        ['1984', 'Americas', 'Canada', 'Beer', '5'],
#        ['1985', 'Americas', 'Canada', 'Beer', '4.94'],
#        ...

years_1984 = world_alcohol[world_alcohol[:, 0] == '1984']
# output:
# array([['1984', 'Africa', 'Nigeria', 'Other', '6.1'],
#        ['1984', 'Eastern Mediterranean', 'Afghanistan', 'Other', '0'],
#        ['1984', 'Americas', 'Costa Rica', 'Wine', '0.06'],
#        ...

is_algeria_and_1986 = (world_alcohol[:, 0] == '1986') & (world_alcohol[:, 2] == 'Algeria')
rows_with_algeria_and_1986 = world_alcohol[is_algeria_and_1986, :]

select non NaN rows

# keep only the rows whose 'similarity' value is not NaN
# (~ inverts the boolean mask; this replaces the "== False" comparison)
df[~np.isnan(df['similarity'])]

test whether each element of a 1-D array is also present in a second array.

all_array = np.array(['vinta/awesome-python', 'vinta/pangu.js', 'django/django', 'kennethreitz/requests'])
my_array = np.array(['vinta/awesome-python', 'django/django'])
# boolean mask with the same length as all_array: True where the element also appears in my_array
# (np.isin is the recommended replacement for the deprecated np.in1d)
# assume_unique=True is safe here because neither array contains duplicates
mask = np.isin(all_array, my_array, assume_unique=True)
# convert True / False to 1 / 0
vector = mask.astype(int)
# output:
# array([1, 0, 1, 0])

ref:
http://stackoverflow.com/questions/7088625/what-is-the-most-efficient-way-to-check-if-a-value-exists-in-a-numpy-array

replace

matrix = np.array([
    [5, 10, 15],
    [20, 25, 30],
    [35, 40, 45],
])
# boolean mask of the rows whose second column (index 1) equals 25
second_column_25 = matrix[:, 1] == 25
# assign 100 to the second column of the rows selected by the mask
matrix[second_column_25, 1] = 100
# output:
# array([[  5,  10,  15],
#        [ 20, 100,  30],
#        [ 35,  40,  45]])

# replace all instances of the string 1986 in the first column of world_alcohol with the string 2014
world_alcohol[world_alcohol[:, 0] == "1986", 0] = "2014"

compute

weights = np.array([81.6,  97.0,  95.2])
heights = np.array([1.9,  1.8,  1.7])
# arithmetic operators work element by element
bmis = weights / (heights ** 2)
# output:
# array([22.60387812, 29.9382716, 32.94117647])

light_bmi = bmis[bmis <= 30]
# output:
# array([22.60387812, 29.9382716])

mat = np.arange(start=1, stop=7).reshape(3, 2)
# output:
# array([[1, 2],
#        [3, 4],
#        [5, 6]])

# the 2-element array is applied to every row (first column * 10, second column * 100)
mat * np.array([10, 100])
# output:
# array([[ 10, 200],
#        [ 30, 400],
#        [ 50, 600]])

# type coercion: booleans are treated as integers (True -> 1, False -> 0)
np.array([True, 1, 2]) + np.array([3, 4, False])
# output:
# array([4, 5, 2])

# type conversion: casting floats to int drops the fractional part
arr = np.array([0.8, 0.4, 2.5, 5.123])
arr = arr.astype(int)
# output:
# array([0, 0, 2, 5])

matrix = np.array([
    [5, 10, 15],
    [20, 25, 30],
    [35, 40, 45],
])

# perform the operation on each row
matrix.sum(axis=1)
# output:
# array([ 30,  75, 120])

# perform the operation on each column
matrix.sum(axis=0)
# output:
# array([60, 75, 90])

dot product 內積

# for 1-D arrays it is the dot product of the two arrays
# 1 * 4 + 3 * (-2) + (-5) * (-1) = 3
a = np.array([1, 3, -5])
b = np.array([4, -2, -1])
a.dot(b)
# output:
# 3

# for 2-D arrays it is equivalent to matrix multiplication
# each row of matrix a is a list of coefficients, and the rows of matrix b are the vectors being combined
#  1 * [3, 1] + 0 * [2, 1] + 2 * [1, 0] = [5, 1]
# -1 * [3, 1] + 3 * [2, 1] + 1 * [1, 0] = [4, 2]
a = np.array((
    (1, 0, 2),
    (-1, 3, 1)
))
b = np.array((
    (3, 1),
    (2, 1),
    (1, 0)
))
a.dot(b)
# output:
# array([[5, 1],
#        [4, 2]])

ref:
https://zh.wikipedia.org/wiki/%E7%9F%A9%E9%99%A3%E4%B9%98%E6%B3%95

sparsity of a matrix

# (number of non-zero entries) / (rows * columns)
# NOTE(review): this ratio is the share of non-zero cells, which is often called "density"; confirm the intended naming
sparsity = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1])
print('Sparsity: {0:.2f}%'.format(sparsity * 100))
# example output:
# Sparsity: 6.30%

ref:
http://stackoverflow.com/questions/38708621/how-to-calculate-percentage-of-sparsity-for-a-numpy-array-matrix

statistics

# 5000 random samples from a normal distribution (mean, standard deviation, count), rounded to 2 decimals
height = np.round(np.random.normal(1.76, 0.20, 5000), 2)
weight = np.round(np.random.normal(60.32, 15, 5000), 2)
# put the two 1-D arrays side by side as the columns of a 5000 x 2 array
city = np.column_stack((height, weight))
# output (values differ on every run because the data is random):
# array([[  1.48,  60.49],
#        [  1.48,  51.5 ],
#        [  1.73,  54.61],
#        ...,
#        [  1.55,  61.13],
#        [  1.3 ,  55.36],
#        [  1.5 ,  72.42]])

# mean
np.mean(height)

# median
np.median(height)

# correlation coefficient
np.corrcoef(city[:, 0], city[:, 1])
# output:
# array([[ 1.        ,  0.02700209],
#        [ 0.02700209,  1.        ]])

# standard deviation
np.std(city[:, 0])
# 0.19767575712767613

ref:
https://campus.datacamp.com/courses/intro-to-python-for-data-science/chapter-4-numpy?ex=8
https://campus.datacamp.com/courses/intro-to-python-for-data-science/chapter-4-numpy?ex=13

IPython: Another Python REPL interpreter

IPython: Another Python REPL interpreter

IPython is a neat alternative to Python's builtin REPL (Read–Eval–Print Loop) interpreter, and it is also a kernel of Jupyter.

ref:
https://ipython.org/
https://jupyter.org/

Useful Commands

# cheatsheet
%quickref

# show details of any objects (including modules, classes, functions and variables)
random?
os.path.join?
some_variable?

# show source code of any objects
os.path.join??

# show nothing for a function that is not implemented in Python
len??

# run some shell commands directly in IPython
pwd
ll
cd
cp
rm
mv
mkdir new_folder

# run any shell command with ! prefix
!ls
!ping www.google.com
!youtube-dl

# assign command output to a variable
contents = !ls
print(contents)

ref:
http://ipython.readthedocs.io/en/stable/interactive/tutorial.html

Magic Functions

# list all magic functions
%lsmagic

# run a Python script and load objects into current session
%run my_script.py

# run a profiling for multi-line code
%%timeit
array = []
for i in xrange(100):
    array.append(i)

# paste multi-line code
%paste
%cpaste

# explore objects
%pdoc some_object
%pdef some_object
%psource some_object
%pfile some_object

# if you call it after hitting an exception, it will automatically open ipdb at the point of the exception
%debug

# the In object is a list which keeps track of the commands in order
print(In)

# the Out object is a dictionary mapping input numbers to their outputs
print(Out)
# Out[2] and _2 both refer to the output of input number 2
print(Out[2], _2)

ref:
http://ipython.readthedocs.io/en/stable/interactive/magics.html
https://www.safaribooksonline.com/library/view/python-data-science/9781491912126/ch02.html

Speed up Python and Node.js builds on Travis CI

Speed up Python and Node.js builds on Travis CI

Travis CI's caching archives all directories listed in the configuration and uploads them to Amazon S3. Cached contents are available to any build on the repository, including Pull Requests. For Python and Node.js projects, you could cache both site-packages and node_modules directories in every Travis CI build.

Here is an example of .travis.yml:

sudo: false

language: python

python:
  - "2.7"

node_js: 4

# directories listed here are archived after a build and restored for later builds
cache:
  directories:
    - $HOME/.cache/pip
    # NOTE(review): this path hard-codes the interpreter version (python2.7.9); confirm it matches the virtualenv of the build image
    - $HOME/virtualenv/python2.7.9/lib/python2.7/site-packages
    - node_modules

before_install:
  - pip install -U pip

install:
  - pip install -r requirements.txt
  # bin folders are not cached, so force a reinstall to get the coverage executable back
  - pip install coverage --ignore-installed
  - npm install

script:
  - coverage run manage.py test

In my case, after applying these changes, the installation time of pip and npm dropped from 180 seconds to 5 seconds.

One thing should be mentioned here: since we didn't specify any bin folder in the configuration (and I don't think that's necessary), any executable installed by pip, such as coverage or django-admin.py, will not exist in subsequent builds. If you need those commands, you can force-install them by adding pip install some_package --ignore-installed.

References:

Caching Dependencies and Directories
https://docs.travis-ci.com/user/caching/

How to cache requirements for a Django project on Travis-CI?
http://stackoverflow.com/questions/19422229/how-to-cache-requirements-for-a-django-project-on-travis-ci

如何在 Travis CI 加快 Python 單元測試速度
https://tzangms.com/how-to-speed-up-python-unit-test-on-travis-ci/

Integrate with webpages using CasperJS (built on top of PhantomJS)

Integrate with webpages using CasperJS (built on top of PhantomJS)

PhantomJS is a headless and scriptable WebKit runtime (aka browser) with JavaScript API.

Usage

in script.js

Login and delete spare movie tags on Douban.

var casper = require('casper').create({
  pageSettings: {
    loadImages: true,
    loadPlugins: false
  },
  logLevel: 'debug',
  verbose: true
});

// save session cookies
var fs = require('fs');
// NOTE(review): `page` is not used anywhere in the visible script
var page = require('webpage').create();

// file in which the cookies are stored as JSON
var cookieFile = 'cookies.json';

// Write the current cookies (phantom.cookies) to cookieFile.
// The write sits in the catch branch, so it only runs when fs.statSync(cookieFile) throws.
// NOTE(review): statSync is a Node.js fs function and may not exist in PhantomJS's fs module;
// if so, the call throws on every invocation and the file is rewritten each time — confirm.
var saveSessionCookie = function() {
  try {
    fs.statSync(cookieFile);
  } catch (e) {
    fs.write(cookieFile, JSON.stringify(phantom.cookies), 'w');
  }
}

// if a cookie file already exists, load every cookie stored in it into PhantomJS
if (fs.isFile(cookieFile)) {
  Array.prototype.forEach.call(JSON.parse(fs.read(cookieFile)), function(x) {
    phantom.addCookie(x);
  });
}

// script
var loginUrl = 'https://accounts.douban.com/login';
var startUrl = 'https://movie.douban.com/people/vinta/all';

var tags_do_not_delete = [
  '丹麦', '新西兰', '新加坡', '以色列', '印度', '意大利', '瑞典', '墨西哥', '俄罗斯', '西班牙', '比利时'
];

// Open the login page, then fill in and submit the login form.
casper.start(loginUrl, function() {
  this.echo(this.getCurrentUrl());
  this.echo(this.getTitle());

  // screenshot of the login page (handy for reading a captcha, see below)
  this.capture('login.png');

  var data = {
    form_email: 'xxx',
    form_password: 'xxx'
  };

  // Douban may ask for a captcha.
  // In that case run: casperjs script.js --remote-debugger-port=9000
  // open login.png first to see what the captcha is,
  // then enter it manually in the console at http://127.0.0.1:9000/
  // data['captcha-solution'] = '123';

  // waitForSelector() only schedules a wait step and returns immediately,
  // so the form is filled inside its callback, i.e. once the form really exists.
  // The third argument (true) submits the form after filling it.
  this.waitForSelector('form#lzform', function() {
    this.fill('form#lzform', data, true);
  });
});

// Next step: store the cookies, then open the start page and expand its tag list.
casper.then(function() {
  this.echo(this.getCurrentUrl());
  this.echo(this.getTitle());

  saveSessionCookie();

  this.capture('all.png');

  this.open(startUrl).then(function() {
    // click '#open_tags' as soon as it is present in the page
    this.waitForSelector('#open_tags', function() {
      this.click('#open_tags');
    });

    // then wait until '#open_tags' is no longer present
    this.waitWhileSelector('#open_tags');
  });
});

// Collect every tag link of the current page and delete the rarely used tags.
casper.then(function() {
  this.echo(this.getCurrentUrl());
  this.echo(this.getTitle());

  // evaluated inside the page: one {tag, href, count} object per tag link
  var links = this.evaluate(function() {
    var anchors = document.querySelectorAll('ul.tag-list li a');

    return Array.prototype.map.call(anchors, function(anchor) {
      return {
        tag: anchor.textContent.trim(),
        href: anchor.getAttribute('href'),
        count: parseInt(anchor.nextElementSibling.textContent, 10)
      };
    });
  });

  // keep the tags used fewer than 5 times that are not listed in tags_do_not_delete
  var filteredLinks = links.filter(function(link) {
    return link.count < 5 && tags_do_not_delete.indexOf(link.tag) === -1;
  });

  this.each(filteredLinks, function(self, link) {
    this.echo(link.tag + ', ' + link.count);

    // open the page of the tag, click its delete button, then confirm the deletion
    self.thenOpen(link.href, function() {
      this.echo(this.getCurrentUrl());
      this.echo(this.getTitle());

      this.waitForSelector('#tag-del', function() {
        this.click('#tag-del');
      });

      this.waitForSelector('input[name="del_submit"]', function() {
        this.click('input[name="del_submit"]');
      });
    });
  });
});

casper.run();

To evaluate JavaScript code in the context of the webpage, you must use the evaluate() function. That context is a sandbox.

ref:
http://docs.casperjs.org/en/latest/modules/index.html

ref:
https://github.com/vinta/playground/blob/master/casperjs/script.js

Save session cookies

--cookies-file=xxx.txt only stores non-session cookies, so the session cookies that keep you logged in or authenticated are not persisted. You have to save every cookie manually.

var casper = require('casper').create();

// save session cookies
var fs = require('fs');
// NOTE(review): `page` is not used in this snippet
var page = require('webpage').create();

// file in which the cookies are stored as JSON
var cookieFile = 'cookies.json';

// Write the current cookies (phantom.cookies) to cookieFile.
// The write sits in the catch branch, so it only runs when fs.statSync(cookieFile) throws.
// NOTE(review): statSync is a Node.js fs function and may not exist in PhantomJS's fs module;
// if so, the call throws on every invocation and the file is rewritten each time — confirm.
var saveSessionCookie = function() {
  try {
    fs.statSync(cookieFile);
  } catch (e) {
    fs.write(cookieFile, JSON.stringify(phantom.cookies), 'w');
  }
}

// if a cookie file already exists, load every cookie stored in it into PhantomJS
if (fs.isFile(cookieFile)) {
  Array.prototype.forEach.call(JSON.parse(fs.read(cookieFile)), function(x) {
    phantom.addCookie(x);
  });
}

casper.start('yourUrl', function() {
  // your own logic goes here
});

ref:
http://stackoverflow.com/questions/18739354/how-can-i-use-persisted-cookies-from-a-file-using-phantomjs

Run

$ docker run --rm -v `pwd`:/data vinta/casperjs:1.1.3 script.js

# or

$ brew install casperjs
$ casperjs script.js --disk-cache=true

ref:
https://hub.docker.com/r/vinta/casperjs/
https://hub.docker.com/r/zopanix/casperjs/

ref:
http://phantomjs.org/api/command-line.html

Run in debugging mode

$ casperjs script.js --remote-debugger-port=9000
$ open http://127.0.0.1:9000/
  • Click the first link (something like "file:///usr/local/Cellar/xxx").
  • In Sources tab, press "Enable Debugging" button.
  • In Console tab, type "__run();" to start.
  • Once a breakpoint is hit, you can use the Console tab to debug.

ref:
http://phantomjs.org/troubleshooting.html