We want to test how fast we can compute an economic model using GPU computing tools.
To do so we benchmark our code with a well known economic problem: a consumption savings model. We build off the model and codes outlined in A Practical Guide to Parallel Computing in Macroeconomics by Jesús Fernández-Villaverde and David Zarruk Valencia, repository here.
We benchmark three python parallel programming methods:
Cupy is a numpy-like package that automatically parralizes vector operation using GPUs. For this reason we benchmark it against a similar vector implementation using numpy. We also benchmark against numba, which was the fasted non-GPU based python pacakge in our tests. Numba is a just-in-time compiler that can get speeds close to C in python.
Headline results:
Cupy is almost 75x faster than our best alternative using Numba (~3 seconds)! It is also a significant speed-up relative to the best time reposted by Fernández-Villaverde and Zarruk: 1.42 seconds in a GPU implementation using Open ACC and C++.
We ran this on a machine with a Intel i7-9750H CPU with 6cores/12threads, and a Nvidia GeForce GTX 1650 GPU with 896 CUDA cores.
The code to set parameters and model objects is taken directly ffrom
#--------------------------------#
# House-keeping #
#--------------------------------#
from numba import jit, njit, prange, int64, float64, vectorize, guvectorize
from numba.experimental import jitclass
import numpy
import numpy as np
import math
import time
from scipy.stats import norm
from collections import OrderedDict
import sys
#added by brian
from numba import cuda
import cupy as cp
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})
plt.rcParams.update({'lines.linewidth': 3.0})
plt.style.use('default')
#--------------------------------#
# Initialization #
#--------------------------------#
# Number of workers
# Grid for x
nx = 1500;
xmin = 0.1;
xmax = 4.0;
# Grid for e: parameters for Tauchen
ne = 15;
ssigma_eps = 0.02058;
llambda_eps = 0.99;
m = 1.5;
# Utility function
ssigma = 2;
bbeta = 0.97;
T = 10;
# Prices
r = 0.07;
w = 5;
# Initialize the grid for X
xgrid = numpy.zeros(nx)
# Initialize the grid for E and the transition probability matrix
egrid = numpy.zeros(ne)
P = numpy.zeros((ne, ne))
#--------------------------------#
# Grid creation #
#--------------------------------#
# Function to construct the grid for capital (x)
size = nx;
xstep = (xmax - xmin) /(size - 1);
it = 0;
for i in range(0,nx):
xgrid[i] = xmin + it*xstep;
it = it+1;
# Function to construct the grid for productivity (e) using Tauchen (1986)
size = ne;
ssigma_y = math.sqrt(math.pow(ssigma_eps, 2) / (1 - math.pow(llambda_eps,2)));
estep = 2*ssigma_y*m / (size-1);
it = 0;
for i in range(0,ne):
egrid[i] = (-m*math.sqrt(math.pow(ssigma_eps, 2) / (1 - math.pow(llambda_eps,2))) + it*estep);
it = it+1;
# Function to construct the transition probability matrix for productivity (P) using Tauchen (1986)
mm = egrid[1] - egrid[0];
for j in range(0,ne):
for k in range(0,ne):
if (k == 0):
P[j, k] = norm.cdf((egrid[k] - llambda_eps*egrid[j] + (mm/2))/ssigma_eps);
elif (k == ne-1):
P[j, k] = 1 - norm.cdf((egrid[k] - llambda_eps*egrid[j] - (mm/2))/ssigma_eps);
else:
P[j, k] = norm.cdf((egrid[k] - llambda_eps*egrid[j] + (mm/2))/ssigma_eps) - norm.cdf((egrid[k] - llambda_eps*egrid[j] - (mm/2))/ssigma_eps);
# Exponential of the grid e
for i in range(0,ne):
egrid[i] = math.exp(egrid[i]);
#--------------------------------#
# Structure and function #
#--------------------------------#
# Value function
VV = math.pow(-10.0, 5);
specs = OrderedDict()
specs['ind'] = int64
specs['ne'] = int64
specs['nx'] = int64
specs['T'] = int64
specs['age'] = int64
specs['P'] = float64[:,:]
specs['xgrid'] = float64[:]
specs['egrid'] = float64[:]
specs['ssigma'] = float64
specs['bbeta'] = float64
specs['w'] = float64
specs['r'] = float64
specs['V'] = float64[:,:,:]
# Data structure of state and exogenous variables
@jitclass(specs)
class modelState(object):
def __init__(self,ind,ne,nx,T,age,P,xgrid,egrid,ssigma,bbeta,w,r,V):
self.ind = ind
self.ne = ne
self.nx = nx
self.T = T
self.age = age
self.P = P
self.xgrid = xgrid
self.egrid = egrid
self.ssigma = ssigma
self.bbeta = bbeta
self.w = w
self.r = r
self.V = V
# Function that returns value for a given state
# ind: a unique state that corresponds to a pair (ie,ix)
@njit
def value_func(states):
ind = states.ind
ne = states.ne
nx = states.nx
T = states.T
age = states.age
P = states.P
xgrid = states.xgrid
egrid = states.egrid
ssigma = states.ssigma
bbeta = states.bbeta
w = states.w
r = states.r
V = states.V
ix = int(math.floor(ind/ne));
ie = int(math.floor(ind%ne));
VV = math.pow(-10.0, 3)
for ixp in range(0,nx):
expected = 0.0;
if(age < T-1):
for iep in range(0,ne):
expected = expected + P[ie, iep]*V[age+1, ixp, iep]
cons = (1 + r)*xgrid[ix] + egrid[ie]*w - xgrid[ixp];
utility = math.pow(cons, (1-ssigma))/(1-ssigma) + bbeta*expected;
if(cons <= 0):
utility = math.pow(-10.0,5);
if(utility >= VV):
VV = utility;
utility = 0.0;
return[VV];
@njit(parallel=True)
def compute(age, V):
for ind in prange(0,nx*ne):
states = modelState(ind, ne, nx, T, age, P, xgrid, egrid, ssigma, bbeta, w, r, V)
ix = int(math.floor(ind/ne));
ie = int(math.floor(ind%ne));
V[age, ix, ie] = value_func(states)[0];
return(V)
start3 = time.time()
# Initialize value function V
V = numpy.zeros((T, nx, ne))
for age in range(T-1, -1, -1):
V = compute_nonparrallel(age, V)
finish2 = time.time() - start3
print("Age: ", age+1, ". Time: ", round(finish2, 4), " seconds.")
finish_nonpar = time.time() - start3
print("TOTAL ELAPSED TIME (not parrallelized): ", round(finish_nonpar , 4), " seconds. \n")
start = time.time()
# Initialize value function V
V = numpy.zeros((T, nx, ne))
for age in range(T-1, -1, -1):
V = compute(age, V)
finish = time.time() - start
print("Age: ", age+1, ". Time: ", round(finish, 4), " seconds.")
finish = time.time() - start
print("TOTAL ELAPSED TIME: ", round(finish, 4), " seconds. \n")
#---------------------#
# Some checks #
#---------------------#
print(" - - - - - - - - - - - - - - - - - - - - - \n")
print("The first entries of the value function: \n")
for i in range(0,3):
print(round(V[0, 0, i], 5))
print(" \n")
# Bellman equation with assets x as endog state
#
# v(x,e,t) = max_{x'} { u((1+r)x+wy-x') + beta * E[ v(x',y',t+1)|y] }
#
nstates=nx*ne
V = np.zeros((nx*ne,T))
b = np.zeros((nx*ne,T))
xgrid_vec = xgrid[np.newaxis,:]
# felicity as function of state & choice (nx x ne)
# preset consumption, and penalize violation of budget constraint
#testing dimentions
#temp1=(1+r)*np.kron(np.ones((ne,1)),xgrid[:,np.newaxis])
#temp2 = w* np.kron(egrid[:,np.newaxis],np.ones((nx,1)))
#print(temp1.shape,temp2.shape)
#temp3 = -np.ones((nstates,1))*xgrid
#print(temp1.shape,temp2.shape,temp3.shape)
#cons =temp1 +temp2 +temp3
cons = (1+r)*np.kron(np.ones((ne,1)),xgrid[:,np.newaxis]) +w* np.kron(egrid[:,np.newaxis],np.ones((nx,1))) -np.ones((nstates,1))*xgrid
#print(cons.shape)
util=(-10**(10))*np.ones((nstates,nx))
#print(util.shape)
util[cons>0]=(cons[cons>0]**(1-ssigma))/(1-ssigma)
#print(util.shape)
tempV=np.zeros(nstates)
longetrans=np.kron(P,np.ones((1,nx)))
picker=np.kron(np.ones((ne,1)),np.eye(nx))
#print(picker.shape,longetrans.shape, tempV.shape)
#(tempV[:,np.newaxis] @ np.ones((1,nx))).shape
start = time.time()
for t in range(T-1, -1, -1):
# Bellman eqn
print(t)
smallcont= longetrans @ (picker* (tempV[:,np.newaxis] @ np.ones((1,nx)) ) )
cont=np.kron(smallcont,np.ones((nx,1)))
v_before_max = util + bbeta #* cont
#print(v_before_max.shape)
tempV= np.copy(np.amax(v_before_max,axis=1))
policyind = np.argmax(util + bbeta * cont,axis=1)
#print(tempV.shape,V.shape)
V[:,t]= np.copy(tempV)
b[:,t]=xgrid[policyind]
finish = time.time() - start
print(['Age: ', t, '. Time: ', round(finish, 3), ' seconds.'])
finish_nonpar = time.time() - start
print("TOTAL ELAPSED TIME (not parrallelized): ", round(finish_nonpar , 4), " seconds. \n")
print( " - - - - - - - - - - - - - - - - - - - - - \n")
print( "The first entries of the value function: \n")
for i in range(0,3):
print(round(V[i, 0], 5))
print(" \n")
plt.plot(b[:,1])
plt.title('Policy function')
#save result to compare
b_non_gpu = b[0:100,1]
Convert vectorized numpy code to cupy. This is easy, replace np with cp everywhere. We just need to ensure all the vectors are initiaized with cupy.
P=cp.array(P)
xgrid=cp.array(xgrid)
egrid=cp.array(egrid)
#store value & policy functions
V = cp.zeros((nx*ne,T))
b = cp.zeros((nx*ne,T))
nstates=nx*ne
#
# v(x,e,t) = max_{x'} { u((1+r)x+wy-x') + beta * E[ v(x',y',t+1)|y] }
#
V = cp.zeros((nx*ne,T))
b = cp.zeros((nx*ne,T))
# felicity as function of state & choice (nx x ne)
# preset consumption, and penalize violation of budget constraint
cons = (1+r)*cp.kron(cp.ones((ne,1)),xgrid[:,cp.newaxis]) +w* cp.kron(egrid[:,cp.newaxis],cp.ones((nx,1))) -cp.ones((nstates,1))*xgrid
#print(cons.shape)
util=(-10**(10))*cp.ones((nstates,nx))
#print(util.shape)
util[cons>0]=(cons[cons>0]**(1-ssigma))/(1-ssigma)
#print(util.shape)
tempV=cp.zeros(nstates)
longetrans=cp.kron(P,cp.ones((1,nx)))
picker=cp.kron(cp.ones((ne,1)),cp.eye(nx))
#print(picker.shape,longetrans.shape, tempV.shape)
#(tempV[:,np.newaxis] @ np.ones((1,nx))).shape
start = time.time()
for t in range(T-1, -1, -1):
# Bellman eqn
print(t)
smallcont= longetrans @ (picker* (tempV[:,cp.newaxis] @ cp.ones((1,nx)) ) )
cont=cp.kron(smallcont,cp.ones((nx,1)))
v_before_max = util + bbeta #* cont
#print(v_before_max.shape)
tempV= cp.copy(cp.amax(v_before_max,axis=1))
policyind = cp.argmax(util + bbeta * cont,axis=1)
#print(tempV.shape,V.shape)
V[:,t]= cp.copy(tempV)
b[:,t]=xgrid[policyind]
finish = time.time() - start
print(['Age: ', t, '. Time: ', round(finish, 3), ' seconds.'])
finish_nonpar = time.time() - start
print("TOTAL ELAPSED TIME (cupy parrallelized): ", round(finish_nonpar , 4), " seconds. \n")
We print the first ten elements of the policy function, and then the difference between them
print(b_non_gpu[:10])
print(b[:10,1])
print('Difference = ', cp.array(b_non_gpu[:10])-b[:10,1] )
print(r'Speedup: (vs numpy vectorized)', round(10/0.07,3))
print(r'Speedup: (vs numba)', round(3.24/0.043,3))
fig = plt.figure(figsize=(9,5))
ax = fig.add_axes([0,0,1,1])
langs = ['Numba (no parallel)', 'Numpy (CPU, vectorized)', 'Numba (CPU, non-vectorized)' , 'Cupy (GPU, vectorized)']
speed = [13.7, 10.8425,3.2096,0.2606]
ax.bar(langs,speed, color='gray' )
ax.set_ylabel('Time, seconds')
ax.set_title('Comparison of speed of parallel programming packages')
fig.savefig('parallel_speeds.png')
plt.show()