# Benchmark inputs: a 6-element pattern repeated 1000x -> 6000 small ints.
a = [1,2,3,4,5,6] * 1000
b = [2,3,4,5,7,6] * 1000

A philosophy I like to follow in Python is "Python is slow; let's code everything and then see if we have any bottlenecks we should replace with something else." If you find such bottlenecks, you can replace them with a faster library or another language.
Replacing Python by Cython is one of the ways to speedup such bottlenecks.
Let’s try to code a correlation computation function in cython and compare it to Python or Numpy.
First, the easy numpy version
import numpy as np
%timeit np.corrcoef(a,b)

947 µs ± 113 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Now, a simple pure Python version.
def correlation(a_samples, b_samples):
    """Return the Pearson correlation coefficient of two numeric sequences.

    Args:
        a_samples: first sequence of numbers.
        b_samples: second sequence of numbers; must be the same length
            as ``a_samples``.

    Returns:
        The Pearson correlation coefficient as a float, in [-1, 1].

    Raises:
        ValueError: if the two sequences differ in length. Without this
            check, ``zip`` would silently truncate the covariance to the
            shorter sequence while the variances are still computed over
            the full lists, producing a wrong answer.
        ZeroDivisionError: if either sequence is empty or constant
            (zero variance).
    """
    n = len(a_samples)
    if n != len(b_samples):
        raise ValueError("a_samples and b_samples must have the same length")
    a_mean = sum(a_samples) / n
    b_mean = sum(b_samples) / n
    diff_a_samples = [a - a_mean for a in a_samples]
    diff_b_samples = [b - b_mean for b in b_samples]
    # Generator expressions: sum() consumes them directly, no throwaway lists.
    covariance = sum(da * db for da, db in zip(diff_a_samples, diff_b_samples))
    variance_a = sum(da ** 2 for da in diff_a_samples)
    variance_b = sum(db ** 2 for db in diff_b_samples)
    return covariance / (variance_a * variance_b) ** 0.5
%timeit correlation(a,b)

2.38 ms ± 85.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Let’s now try to build a version in cython.
First, I have to transform the Python lists in C arrays. Then I compute the values one by one, making sure that I don’t leave any Python operations.
%load_ext cython

The cython extension is already loaded. To reload it, use:
%reload_ext cython
%%cython
import cython
from libc.stdlib cimport malloc, free
from libc.math cimport sqrt
def cython_correlation(a_samples, b_samples):
    """Pearson correlation coefficient of two equal-length integer sequences.

    Copies both Python sequences into C arrays, then computes the means,
    centered differences, covariance and variances entirely with C-level
    loops before combining them into the correlation.

    Raises:
        ValueError: if the inputs differ in length.
        MemoryError: if any of the C work buffers cannot be allocated.
    """
    cdef int a_len = len(a_samples)
    cdef int b_len = len(b_samples)
    # Every loop below runs over a_len; without this check a shorter b
    # would be read/written past the end of its buffer.
    if a_len != b_len:
        raise ValueError("a_samples and b_samples must have the same length")
    # Allocate all buffers up front. free(NULL) is a no-op, so one
    # try/finally safely releases whatever was actually allocated —
    # including the diff arrays, which previously were never freed, and
    # the earlier buffers when a later malloc fails.
    cdef int *a_arr = <int *>malloc(a_len * cython.sizeof(int))
    cdef int *b_arr = <int *>malloc(b_len * cython.sizeof(int))
    cdef double *diff_a = <double *>malloc(a_len * cython.sizeof(double))
    cdef double *diff_b = <double *>malloc(b_len * cython.sizeof(double))
    cdef int i
    # long accumulators: int sums can overflow for long/large-valued inputs.
    cdef long a_sum = 0
    cdef long b_sum = 0
    cdef double a_mean, b_mean
    cdef double covariance = 0
    cdef double variance_a = 0
    cdef double variance_b = 0
    try:
        if a_arr is NULL or b_arr is NULL or diff_a is NULL or diff_b is NULL:
            raise MemoryError
        # Convert the Python lists into C arrays.
        for i in range(a_len):
            a_arr[i] = a_samples[i]
            b_arr[i] = b_samples[i]
        # Sums, then means; cast to double so the division cannot
        # silently truncate to an integer.
        for i in range(a_len):
            a_sum += a_arr[i]
            b_sum += b_arr[i]
        a_mean = (<double>a_sum) / a_len
        b_mean = (<double>b_sum) / b_len
        # Centered differences feed both the covariance and the variances.
        for i in range(a_len):
            diff_a[i] = a_arr[i] - a_mean
            diff_b[i] = b_arr[i] - b_mean
        for i in range(a_len):
            covariance += diff_a[i] * diff_b[i]
            variance_a += diff_a[i] ** 2
            variance_b += diff_b[i] ** 2
        return covariance / sqrt(variance_a * variance_b)
    finally:
        free(a_arr)
        free(b_arr)
        free(diff_a)
        free(diff_b)
%timeit cython_correlation(a,b)
# 10000 loops, best of 5: 154 µs per loop

Nice! We got a 6X improvement compared to NumPy and a 15X improvement compared to pure Python. Pretty cool.