8

I am writing a proof-of-concept implementation of Newton's method for minimizing the negative log-likelihood in a logistic regression model. I'm comparing the performance of a native Python implementation and a Fortran implementation linked via f2py. Surprisingly, the Fortran version is slower than the Python version. Does anyone have any idea why this would be the case, or have any suggestions for improving the Fortran code's performance?
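For reference, both implementations are meant to minimize the averaged negative log-likelihood (notation mine: $x_i^\top$ is the $i$-th row of $X$ and $\sigma$ is the logistic function)

$$f(\beta) = -\frac{1}{n}\sum_{i=1}^{n}\left[\,y_i\log\sigma(x_i^\top\beta) + (1-y_i)\log\big(1-\sigma(x_i^\top\beta)\big)\right],\qquad \sigma(z)=\frac{e^z}{1+e^z},$$

via the Newton update $\beta \leftarrow \beta - H(\beta)^{-1}\nabla f(\beta)$, with gradient $\nabla f(\beta)=\frac{1}{n}X^\top\big(\sigma(X\beta)-y\big)$ and Hessian $H(\beta)=\frac{1}{n}X^\top D X$, where $D=\operatorname{diag}\big(\sigma(X\beta)(1-\sigma(X\beta))\big)$.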

Here's the Python

import numpy as np
import scipy.linalg as la
import warnings

def logistic(z): return np.exp(z)/(1+np.exp(z))

def f(X, y, beta):
    logist = logistic(X@beta)
    return (-y*np.log(logist) - (1-y)*np.log(1-logist)).mean()

def gradf(X, y, beta):
    logist = logistic(X@beta)
    return ((logist-y)[:,None]*X).mean(axis=0)

def hessf(X, y, beta):
    logist = logistic(X@beta)
    D = np.diag(logist*(1-logist))
    return X.T @ D @ X/X.shape[0]

def Newtons_logreg(X, y, beta_init, max_iter=10000, tol=1e-4):
    beta = beta_init.copy()
    i = 0
    while i < max_iter:
        #g = np.linalg.inv(hessf(X, y, beta)) @ gradf(X, y, beta) #compute inverse
        #g = np.linalg.solve(hessf(X, y, beta), gradf(X, y, beta)) #general system solve
        g = la.solve(hessf(X, y, beta), gradf(X, y, beta), assume_a='pos')
        beta -= g
        if np.linalg.norm(g) < tol:
            return beta
        i += 1
    warnings.warn("Convergence not reached. Increase iterations or decrease tolerance")
    return beta

The Fortran

! FILE: FORTRAN_VERSION.F
! Compile with f2py -c -m fortran_v fortran_version.f90
    SUBROUTINE logistic(Z, N, W)
!
!   Compute the logistic function of an array Z of length N
!
    implicit none
    INTEGER N
    REAL*8 Z(N)
    REAL*8 W(N)
!f2py intent(in) z
!f2py integer intent(hide),depend(z) :: n=shape(z,0)
!f2py intent(out) w
    W = EXP(Z)/(1D0+EXP(Z))
    END
    SUBROUTINE f(X, y, beta, N, M, L)
!
!   Compute the logistic regression likelihood w/ design matrix X,
!   response vector y and coefficient vector beta
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), y(N), X(N,M), beta(M), L
!f2py intent(in) X,y,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) L
    CALL logistic(MATMUL(X, RESHAPE(beta, (/M,1/))), N, S)
    L = SUM((-y*LOG(S) - (1-y)*LOG(1D0-S))/FLOAT(N))
    END

    SUBROUTINE gradf(X, y, beta, N, M, G)
!
!   Compute the gradient of the logistic regression likelihood f
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), y(N), X(N,M), beta(M), G(M)
!f2py intent(in) X,y,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) G
    CALL logistic(MATMUL(X, RESHAPE(beta, (/M,1/))), N, S) !may be able to use a ptr instead
    G = RESHAPE(MATMUL(RESHAPE(S-y, (/1,N/))/float(N), X), (/M/))
    END

    SUBROUTINE hessf(X, beta, N, M, H)
!
!   Compute the Hessian of the logistic regression likelihood f
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), X(N,M), beta(M), H(M,M)
!f2py intent(in) X,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) H
    CALL logistic(MATMUL(X, RESHAPE(beta, (/M,1/))), N, S)
    H = MATMUL(TRANSPOSE(X), SPREAD(S*(1D0-S)/float(N), 2, M)*X)
    END

    SUBROUTINE Newtons_logreg(X, y, beta_i, max_iter, tol, N, M)
!
!   Compute the maximum likelihood estimate of logistic regression w/ design
!   matrix X and response variable y
!
    implicit none
    INTEGER N,M,it,max_iter,O
    REAL*8 y(N), X(N,M), tol
    REAL*8 beta_i(M)
    REAL*8 H(M,M), G(M)
!f2py intent(in) X,y,beta_i,max_iter,tol
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) beta_i
    external DPOSV !double precision symmetric positive definite linear solve
    !external DSYSV !double precision symmetric linear solve
    !external DGELSV
    it=0
    DO WHILE ( it .LT. max_iter)
        call hessf(X, beta_i, N, M, H)
        call gradf(X, y, beta_i, N, M, G)
        call DPOSV('U', M, 1, H, M, G, M, O)
        beta_i = beta_i - G
        IF (NORM2(G) .LT. tol) RETURN
        it = it + 1
    ENDDO
    END

! END OF FILE FORTRAN_VERSION.F

The test script

import numpy as np
import sys
sys.path.append("./")
import fortran_v
import python_version
import time

def run_both_tests(n, p, max_iter=10000, tol=1e-4, runs=1000):
    fortran_times = np.zeros(runs)
    fortran_errs = np.zeros(runs)
    python_times = np.zeros(runs)
    python_errs = np.zeros(runs)
    for i in range(runs):
        np.random.seed(i)
        X = np.random.normal(size=(n,p))
        beta = np.random.normal(size=p)
        y = np.random.binomial(n=1, p=python_version.logistic(X@beta), size=n)
        start = time.time()
        beta_init = np.zeros(p)
        python_errs[i] = np.linalg.norm(python_version.Newtons_logreg(X, y, beta_init, max_iter, tol))
        python_times[i] = time.time()-start
        start = time.time()
        beta_init = np.zeros(p)
        fortran_errs[i] = np.linalg.norm(fortran_v.newtons_logreg(X, y, beta_init, max_iter, tol))
        fortran_times[i] = time.time()-start
    print("Mean/Std of Python Runs: {} +/- {}".format(python_times.mean(), python_times.std()))
    print("Mean/Std of Fortran Runs: {} +/- {}".format(fortran_times.mean(), fortran_times.std()))
    print("Mean/Std of Differences in Errors: {} +/- {}".format(np.abs(python_errs-fortran_errs).mean(), np.abs(python_errs-fortran_errs).std()))

and its output

In [4]: test_script.run_both_tests(50000, 500, runs=10)
Mean/Std of Python Runs: 127.61297235488891 +/- 2.759745817424147
Mean/Std of Fortran Runs: 165.95721955299376 +/- 3.9609319826282143
Mean/Std of Differences in Errors: 1.8474111129762605e-14 +/- 1.1435103132435445e-14

Note that this is my first Fortran program, so low hanging fruit is especially appreciated.

Robert Bassett
  • A couple of thoughts. (1) The matmul function, at least in the past for gcc, is actually about the worst thing ever. You would be far better off calling one of the BLAS routines. This may have changed, but last time I checked it was still the case. (2) The real penalty that Python pays is in the loops. My brief look through your code suggested that only one loop was actually contained in the Fortran. It may be that you really are not gaining too much by using Fortran in this instance. – Kyle Mandli Sep 18 '21 at 00:18
  • (3) There is a cost for the bridge between Python and Fortran incurred by f2py. I do not think this would lead to your difference though. – Kyle Mandli Sep 18 '21 at 00:19
  • My best guess is that the matmul really is the problem. You can take a look at some sample Fortran code that compares matrix multiplication implementations to see some of the issues involved: https://github.com/mandli/methods-in-computational-science/blob/master/src/fortran/matrix_multiply.f90 – Kyle Mandli Sep 18 '21 at 00:21
  • Some low-hanging fruit on the Fortran side: try taking out the calls to reshape in your matrix-vector products. Despite the name, matmul does not require both arguments to be rank-2. It can handle matrix-vector products just fine (see https://gcc.gnu.org/onlinedocs/gfortran/MATMUL.html). – Endulum Sep 18 '21 at 03:19
  • Another point to consider is that speed in Fortran can depend a lot on the compiler flags you enable. I don't know what f2py turns on by default with gfortran, but that might be another place to look (see the example after these comments). – Endulum Sep 18 '21 at 03:32
  • This could also be a good question to ask on the Fortran Discourse, where there are many Fortran developers plus representatives from GFortran, Intel, NAG, Cray, NVIDIA, and other Fortran compiler developers. – Scientist Sep 18 '21 at 18:12
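On the compiler-flag point above: the defaults depend on the f2py/NumPy version, but flags can be passed explicitly at build time. A minimal sketch, assuming gfortran (the specific optimization flags here are illustrative choices, not a recommendation):

    # Pass optimization flags through f2py explicitly
    f2py -c --opt='-O3' --f90flags='-march=native' -m fortran_v fortran_version.f90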

1 Answer

8

Shoutout to Kyle Mandli and Endulum, who each contributed to this answer in the comments.

First, I took Endulum's suggestion and removed the redundant reshapes. After this change the Fortran version was beating Python on small-scale examples, but at scale the Python version was still faster. Then I implemented Kyle Mandli's suggestion and replaced all of the matmuls with calls to the BLAS routine dgemm. Amazingly, this change took the mean run time from ~166 seconds to less than 4 seconds for the $50000 \times 500$ example.
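As an aside, the single-column DGEMM calls work, but BLAS also provides the level-2 routine DGEMV, which is arguably the more natural fit for a matrix-vector product. A sketch of the equivalent call (untested), using the same X, beta, Xbeta, N, and M as in the file below:

    ! Computes Xbeta = 1.0*X*beta + 0.0*Xbeta, where X is N-by-M
    CALL DGEMV('N', N, M, 1D0, X, N, beta, 1, 0D0, Xbeta, 1)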

What I learned

There is an order-of-magnitude difference between BLAS/LAPACK routines and gfortran's intrinsic linear algebra functions (matmul and friends). If you are going to outsource any linear-algebra-heavy Python code to Fortran for speed, USE BLAS/LAPACK FUNCTIONS. It makes all the difference.
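To see that gap in isolation, here is a minimal, self-contained benchmark sketch (untested as written) pitting the intrinsic MATMUL against DGEMM. It assumes a BLAS library is available to link against, e.g. gfortran -O2 bench.f90 -lblas:

    PROGRAM bench
    implicit none
    INTEGER, PARAMETER :: N = 1000
    REAL*8 A(N,N), B(N,N), C(N,N)
    INTEGER t0, t1, rate
    CALL RANDOM_NUMBER(A)
    CALL RANDOM_NUMBER(B)
!   Time the intrinsic matrix product
    CALL SYSTEM_CLOCK(t0, rate)
    C = MATMUL(A, B)
    CALL SYSTEM_CLOCK(t1)
    PRINT *, 'MATMUL:', REAL(t1-t0)/REAL(rate), 's  check:', C(1,1)
!   Time the BLAS call: C = 1.0*A*B + 0.0*C
    CALL SYSTEM_CLOCK(t0)
    CALL DGEMM('N', 'N', N, N, N, 1D0, A, N, B, N, 0D0, C, N)
    CALL SYSTEM_CLOCK(t1)
    PRINT *, 'DGEMM: ', REAL(t1-t0)/REAL(rate), 's  check:', C(1,1)
    END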

Here's the run output and the finalized Fortran version for those who are curious.

In [3]: test_script.run_both_tests(50000, 500, runs=10)
Mean/Std of Python Runs: 125.60022213459015 +/- 4.542739296415188
Mean/Std of Fortran Runs: 3.9391146421432497 +/- 0.057918332079517035
Mean/Std of Differences in Errors: 1.8474111129762605e-14 +/- 1.5305558938321453e-14

And the Fortran file. Note that there's still an additional NORM2 in the stopping condition, which could be replaced by LAPACK's DLANGE; a sketch of that replacement follows the file.

! Compile with f2py -c -m fortran_v fortran_version.f90
    SUBROUTINE logistic(Z, N, W)
!
!   Compute the logistic function of an array Z of length N
!
    implicit none
    INTEGER N
    REAL*8 Z(N)
    REAL*8 W(N)
!f2py intent(in) z
!f2py integer intent(hide),depend(z) :: n=shape(z,0)
!f2py intent(out) w
    W = EXP(Z)/(1D0+EXP(Z))
    END
    SUBROUTINE f(X, y, beta, N, M, L)
!
!   Compute the logistic regression likelihood w/ design matrix X,
!   response vector y and coefficient vector beta
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), y(N), X(N,M), beta(M), L, Xbeta(N)
!f2py intent(in) X,y,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) L
    !CALL logistic(MATMUL(X, beta), N, S)
    CALL DGEMM('N', 'N', N, 1, M, 1D0, X, N, beta, M, 0D0, Xbeta, N)
    CALL logistic(Xbeta, N, S)
    L = SUM((-y*LOG(S) - (1-y)*LOG(1D0-S))/FLOAT(N))
    END

    SUBROUTINE gradf(X, y, beta, N, M, G)
!
!   Compute the gradient of the logistic regression likelihood f
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), y(N), X(N,M), beta(M), G(M), Xbeta(N)
!f2py intent(in) X,y,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) G
    !CALL logistic(MATMUL(X, beta), N, S)
    CALL DGEMM('N', 'N', N, 1, M, 1D0, X, N, beta, M, 0D0, Xbeta, N)
    CALL logistic(Xbeta, N, S) !may be able to use a ptr instead
    G = MATMUL((S-y)/float(N), X)
    END

    SUBROUTINE hessf(X, beta, N, M, H)
!
!   Compute the Hessian of the logistic regression likelihood f
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), X(N,M), beta(M), H(M,M), Xbeta(N)
!f2py intent(in) X,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) H
    !CALL logistic(MATMUL(X, beta), N, S)
    CALL DGEMM('N', 'N', N, 1, M, 1D0, X, N, beta, M, 0D0, Xbeta, N)
    CALL logistic(Xbeta, N, S)
    CALL DGEMM('T', 'N', M, M, N, 1D0, X, N, SPREAD(S*(1D0-S)/float(N), 2, M)*X, N, 0D0, H, M)
    END

    SUBROUTINE Newtons_logreg(X, y, beta_i, max_iter, tol, N, M)
!
!   Compute the maximum likelihood estimate of logistic regression w/ design
!   matrix X and response variable y
!
    implicit none
    INTEGER N,M,it,max_iter,O
    REAL*8 y(N), X(N,M), tol
    REAL*8 beta_i(M)
    REAL*8 H(M,M), G(M)
!f2py intent(in) X,y,beta_i,max_iter,tol
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) beta_i
    !external DPOSV !double precision symmetric positive definite linear solve
    !external DSYSV !double precision symmetric linear solve
    !external DGELSV
    it=0
    DO WHILE ( it .LT. max_iter)
        call hessf(X, beta_i, N, M, H)
        call gradf(X, y, beta_i, N, M, G)
        call DPOSV('U', M, 1, H, M, G, M, O)
        beta_i = beta_i - G
        IF (NORM2(G) .LT. tol) RETURN
        !IF (O .NE. 0) WRITE('ERROR IN MATRIX SOLVE')
        it = it + 1
    ENDDO
    END

! END OF FILE FORTRAN_VERSION.F
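For completeness, the DLANGE replacement mentioned above could look something like this inside Newtons_logreg (a sketch, untested): applied to the M-by-1 column G with norm 'F', DLANGE returns the Euclidean norm, and its WORK array is not referenced in that case, so a length-1 dummy suffices.

!   Additional declarations (DLANGE is a function, not a subroutine):
    REAL*8 DLANGE, WORK(1)
    EXTERNAL DLANGE
!   ...and in place of the NORM2 stopping test:
    IF (DLANGE('F', M, 1, G, M, WORK) .LT. tol) RETURN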

Robert Bassett