8

I am writing a proof-of-concept implementation of Newton's method for minimizing the negative log-likelihood in a logistic regression model. I'm comparing the performance of a native Python implementation and a Fortran implementation linked via f2py. Surprisingly, the Fortran version is slower than the Python version. Does anyone have any idea why this would be the case, or have any suggestions for improving the Fortran code's performance?
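For reference, both implementations are meant to minimize the averaged negative log-likelihood (notation mine: $x_i^\top$ is the $i$-th row of $X$ and $\sigma$ is the logistic function)

$$f(\beta) = -\frac{1}{n}\sum_{i=1}^{n}\left[\,y_i\log\sigma(x_i^\top\beta) + (1-y_i)\log\big(1-\sigma(x_i^\top\beta)\big)\right],\qquad \sigma(z)=\frac{e^z}{1+e^z},$$

via the Newton update $\beta \leftarrow \beta - H(\beta)^{-1}\nabla f(\beta)$, with gradient $\nabla f(\beta)=\frac{1}{n}X^\top\big(\sigma(X\beta)-y\big)$ and Hessian $H(\beta)=\frac{1}{n}X^\top D X$, where $D=\operatorname{diag}\big(\sigma(X\beta)(1-\sigma(X\beta))\big)$.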

Here's the Python

import numpy as np
import scipy.linalg as la
import warnings

def logistic(z): return np.exp(z)/(1+np.exp(z))

def f(X, y, beta):
    logist = logistic(X@beta)
    return (-y*np.log(logist) - (1-y)*np.log(1-logist)).mean()

def gradf(X, y, beta):
    logist = logistic(X@beta)
    return ((logist-y)[:,None]*X).mean(axis=0)

def hessf(X, y, beta):
    logist = logistic(X@beta)
    D = np.diag(logist*(1-logist))
    return X.T @ D @ X/X.shape[0]

def Newtons_logreg(X, y, beta_init, max_iter=10000, tol=1e-4):
    beta = beta_init.copy()
    i = 0
    while i < max_iter:
        #g = np.linalg.inv(hessf(X, y, beta)) @ gradf(X, y, beta) #compute inverse
        #g = np.linalg.solve(hessf(X, y, beta), gradf(X, y, beta)) #general system solve
        g = la.solve(hessf(X, y, beta), gradf(X, y, beta), assume_a='pos')
        beta -= g
        if np.linalg.norm(g) < tol:
            return beta
        i += 1
    warnings.warn("Convergence not reached. Increase iterations or decrease tolerance")
    return beta

The Fortran

! FILE: FORTRAN_VERSION.F
! Compile with f2py -c -m fortran_v fortran_version.f90
    SUBROUTINE logistic(Z, N, W)
!
!   Compute the logistic function of an array Z of length N
!
    implicit none
    INTEGER N
    REAL*8 Z(N)
    REAL*8 W(N)
!f2py intent(in) z
!f2py integer intent(hide),depend(z) :: n=shape(z,0)
!f2py intent(out) w
    W = EXP(Z)/(1D0+EXP(Z))
    END
    SUBROUTINE f(X, y, beta, N, M, L)
!
!   Compute the logistic regression likelihood w/ design matrix X,
!   response vector y and coefficient vector beta
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), y(N), X(N,M), beta(M), L
!f2py intent(in) X,y,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) L
    CALL logistic(MATMUL(X, RESHAPE(beta, (/M,1/))), N, S)
    L = SUM((-y*LOG(S) - (1-y)*LOG(1D0-S))/FLOAT(N))
    END

    SUBROUTINE gradf(X, y, beta, N, M, G)
!
!   Compute the gradient of the logistic regression likelihood f
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), y(N), X(N,M), beta(M), G(M)
!f2py intent(in) X,y,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) G
    CALL logistic(MATMUL(X, RESHAPE(beta, (/M,1/))), N, S) !may be able to use a ptr instead
    G = RESHAPE(MATMUL(RESHAPE(S-y, (/1,N/))/float(N), X), (/M/))
    END

    SUBROUTINE hessf(X, beta, N, M, H)
!
!   Compute the Hessian of the logistic regression likelihood f
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), X(N,M), beta(M), H(M,M)
!f2py intent(in) X,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) H
    CALL logistic(MATMUL(X, RESHAPE(beta, (/M,1/))), N, S)
    H = MATMUL(TRANSPOSE(X), SPREAD(S*(1D0-S)/float(N), 2, M)*X)
    END

    SUBROUTINE Newtons_logreg(X, y, beta_i, max_iter, tol, N, M)
!
!   Compute the maximum likelihood estimate of logistic regression w/ design
!   matrix X and response variable y
!
    implicit none
    INTEGER N,M,it,max_iter,O
    REAL*8 y(N), X(N,M), tol
    REAL*8 beta_i(M)
    REAL*8 H(M,M), G(M)
!f2py intent(in) X,y,beta_i,max_iter,tol
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) beta_i
    external DPOSV !double precision symmetric positive definite linear solve
    !external DSYSV !double precision symmetric linear solve
    !external DGELSV
    it=0
    DO WHILE ( it .LT. max_iter)
        call hessf(X, beta_i, N, M, H)
        call gradf(X, y, beta_i, N, M, G)
        call DPOSV('U', M, 1, H, M, G, M, O)
        beta_i = beta_i - G
        IF (NORM2(G) .LT. tol) RETURN
        it = it + 1
    ENDDO
    END

! END OF FILE FORTRAN_VERSION.F

The test script

import numpy as np
import sys
sys.path.append("./")
import fortran_v
import python_version
import time

def run_both_tests(n, p, max_iter=10000, tol=1e-4, runs=1000):
    fortran_times = np.zeros(runs)
    fortran_errs = np.zeros(runs)
    python_times = np.zeros(runs)
    python_errs = np.zeros(runs)
    for i in range(runs):
        np.random.seed(i)
        X = np.random.normal(size=(n,p))
        beta = np.random.normal(size=p)
        y = np.random.binomial(n=1, p=python_version.logistic(X@beta), size=n)
        start = time.time()
        beta_init = np.zeros(p)
        python_errs[i] = np.linalg.norm(python_version.Newtons_logreg(X, y, beta_init, max_iter, tol))
        python_times[i] = time.time()-start
        start = time.time()
        beta_init = np.zeros(p)
        fortran_errs[i] = np.linalg.norm(fortran_v.newtons_logreg(X, y, beta_init, max_iter, tol))
        fortran_times[i] = time.time()-start
    print("Mean/Std of Python Runs: {} +/- {}".format(python_times.mean(), python_times.std()))
    print("Mean/Std of Fortran Runs: {} +/- {}".format(fortran_times.mean(), fortran_times.std()))
    print("Mean/Std of Differences in Errors: {} +/- {}".format(np.abs(python_errs-fortran_errs).mean(), np.abs(python_errs-fortran_errs).std()))

and its output

In [4]: test_script.run_both_tests(50000, 500, runs=10)
Mean/Std of Python Runs: 127.61297235488891 +/- 2.759745817424147
Mean/Std of Fortran Runs: 165.95721955299376 +/- 3.9609319826282143
Mean/Std of Differences in Errors: 1.8474111129762605e-14 +/- 1.1435103132435445e-14

Note that this is my first Fortran program, so low hanging fruit is especially appreciated.

Robert Bassett
  • A couple of thoughts. (1) The matmul function, at least in the past for gcc, is actually about the worst thing ever. You would be far better off calling one of the BLAS routines. This may have changed, but last time I checked it was still the case. (2) The real penalty that Python pays is in the loops. My brief look through your code suggested that only one loop was actually contained in the Fortran. It may be that you really are not gaining too much by using Fortran in this instance. – Kyle Mandli Sep 18 '21 at 00:18
  • (3) There is a cost for the bridge between Python and Fortran incurred by f2py. I do not think this would lead to your difference though. – Kyle Mandli Sep 18 '21 at 00:19
  • My best guess is that the matmul really is the problem. You can take a look at some sample Fortran code that compares matrix multiplication implementations to see some of the issues involved: https://github.com/mandli/methods-in-computational-science/blob/master/src/fortran/matrix_multiply.f90 – Kyle Mandli Sep 18 '21 at 00:21
  • Some low-hanging fruit on the Fortran side: try taking out the calls to reshape in your matrix-vector products. Despite the name, matmul does not require both arguments to be rank-2. It can handle matrix-vector products just fine (see https://gcc.gnu.org/onlinedocs/gfortran/MATMUL.html). – Endulum Sep 18 '21 at 03:19
  • Another point to consider is that speed in Fortran can depend a lot on the compiler flags you enable. I don't know what f2py turns on by default with gfortran, but that might be another place to look (see the example after these comments). – Endulum Sep 18 '21 at 03:32
  • This could also be a good question to ask on the Fortran Discourse, where there are many Fortran developers plus representatives from GFortran, Intel, NAG, Cray, NVIDIA, and other Fortran compiler developers. – Scientist Sep 18 '21 at 18:12
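On the compiler-flag point above: the defaults depend on the f2py/NumPy version, but flags can be passed explicitly at build time. A minimal sketch, assuming gfortran (the specific optimization flags here are illustrative choices, not a recommendation):

    # Pass optimization flags through f2py explicitly
    f2py -c --opt='-O3' --f90flags='-march=native' -m fortran_v fortran_version.f90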

1 Answer

8

Shoutout to Kyle Mandli and Endulum, who each contributed to this answer in the comments.

First, I took Endulum's suggestion and removed the redundant reshapes. After this change the Fortran version was beating Python on small-scale examples, but at scale the Python version was still faster. Then I implemented Kyle Mandli's suggestion and replaced all of the matmuls with calls to the BLAS routine dgemm. Amazingly, this change took the mean run time from ~166 seconds to less than 4 seconds for the $50000 \times 500$ example.
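As an aside, the single-column DGEMM calls work, but BLAS also provides the level-2 routine DGEMV, which is arguably the more natural fit for a matrix-vector product. A sketch of the equivalent call (untested), using the same X, beta, Xbeta, N, and M as in the file below:

    ! Computes Xbeta = 1.0*X*beta + 0.0*Xbeta, where X is N-by-M
    CALL DGEMV('N', N, M, 1D0, X, N, beta, 1, 0D0, Xbeta, 1)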

What I learned

There is an order-of-magnitude difference between BLAS/LAPACK routines and gfortran's intrinsic linear algebra functions (matmul and friends). If you are going to outsource any linear-algebra-heavy Python code to Fortran for speed, USE BLAS/LAPACK FUNCTIONS. It makes all the difference.
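To see that gap in isolation, here is a minimal, self-contained benchmark sketch (untested as written) pitting the intrinsic MATMUL against DGEMM. It assumes a BLAS library is available to link against, e.g. gfortran -O2 bench.f90 -lblas:

    PROGRAM bench
    implicit none
    INTEGER, PARAMETER :: N = 1000
    REAL*8 A(N,N), B(N,N), C(N,N)
    INTEGER t0, t1, rate
    CALL RANDOM_NUMBER(A)
    CALL RANDOM_NUMBER(B)
!   Time the intrinsic matrix product
    CALL SYSTEM_CLOCK(t0, rate)
    C = MATMUL(A, B)
    CALL SYSTEM_CLOCK(t1)
    PRINT *, 'MATMUL:', REAL(t1-t0)/REAL(rate), 's  check:', C(1,1)
!   Time the BLAS call: C = 1.0*A*B + 0.0*C
    CALL SYSTEM_CLOCK(t0)
    CALL DGEMM('N', 'N', N, N, N, 1D0, A, N, B, N, 0D0, C, N)
    CALL SYSTEM_CLOCK(t1)
    PRINT *, 'DGEMM: ', REAL(t1-t0)/REAL(rate), 's  check:', C(1,1)
    END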

Here's the run output and the finalized Fortran version for those who are curious.

In [3]: test_script.run_both_tests(50000, 500, runs=10)
Mean/Std of Python Runs: 125.60022213459015 +/- 4.542739296415188
Mean/Std of Fortran Runs: 3.9391146421432497 +/- 0.057918332079517035
Mean/Std of Differences in Errors: 1.8474111129762605e-14 +/- 1.5305558938321453e-14

And the Fortran file. Note that there's still an additional NORM2 in the stopping condition, which could be replaced by LAPACK's DLANGE; a sketch of that replacement follows the file.

! Compile with f2py -c -m fortran_v fortran_version.f90
    SUBROUTINE logistic(Z, N, W)
!
!   Compute the logistic function of an array Z of length N
!
    implicit none
    INTEGER N
    REAL*8 Z(N)
    REAL*8 W(N)
!f2py intent(in) z
!f2py integer intent(hide),depend(z) :: n=shape(z,0)
!f2py intent(out) w
    W = EXP(Z)/(1D0+EXP(Z))
    END
    SUBROUTINE f(X, y, beta, N, M, L)
!
!   Compute the logistic regression likelihood w/ design matrix X,
!   response vector y and coefficient vector beta
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), y(N), X(N,M), beta(M), L, Xbeta(N)
!f2py intent(in) X,y,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) L
    !CALL logistic(MATMUL(X, beta), N, S)
    CALL DGEMM('N', 'N', N, 1, M, 1D0, X, N, beta, M, 0D0, Xbeta, N)
    CALL logistic(Xbeta, N, S)
    L = SUM((-y*LOG(S) - (1-y)*LOG(1D0-S))/FLOAT(N))
    END

    SUBROUTINE gradf(X, y, beta, N, M, G)
!
!   Compute the gradient of the logistic regression likelihood f
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), y(N), X(N,M), beta(M), G(M), Xbeta(N)
!f2py intent(in) X,y,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) G
    !CALL logistic(MATMUL(X, beta), N, S)
    CALL DGEMM('N', 'N', N, 1, M, 1D0, X, N, beta, M, 0D0, Xbeta, N)
    CALL logistic(Xbeta, N, S) !may be able to use a ptr instead
    G = MATMUL((S-y)/float(N), X)
    END

    SUBROUTINE hessf(X, beta, N, M, H)
!
!   Compute the Hessian of the logistic regression likelihood f
!
    implicit none
    INTEGER N,M
    REAL*8 S(N), X(N,M), beta(M), H(M,M), Xbeta(N)
!f2py intent(in) X,beta
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) H
    !CALL logistic(MATMUL(X, beta), N, S)
    CALL DGEMM('N', 'N', N, 1, M, 1D0, X, N, beta, M, 0D0, Xbeta, N)
    CALL logistic(Xbeta, N, S)
    CALL DGEMM('T', 'N', M, M, N, 1D0, X, N, SPREAD(S*(1D0-S)/float(N), 2, M)*X, N, 0D0, H, M)
    END

    SUBROUTINE Newtons_logreg(X, y, beta_i, max_iter, tol, N, M)
!
!   Compute the maximum likelihood estimate of logistic regression w/ design
!   matrix X and response variable y
!
    implicit none
    INTEGER N,M,it,max_iter,O
    REAL*8 y(N), X(N,M), tol
    REAL*8 beta_i(M)
    REAL*8 H(M,M), G(M)
!f2py intent(in) X,y,beta_i,max_iter,tol
!f2py integer intent(hide),depend(X) :: n=shape(X,0), m = shape(X,1)
!f2py intent(out) beta_i
    !external DPOSV !double precision symmetric positive definite linear solve
    !external DSYSV !double precision symmetric linear solve
    !external DGELSV
    it=0
    DO WHILE ( it .LT. max_iter)
        call hessf(X, beta_i, N, M, H)
        call gradf(X, y, beta_i, N, M, G)
        call DPOSV('U', M, 1, H, M, G, M, O)
        beta_i = beta_i - G
        IF (NORM2(G) .LT. tol) RETURN
        !IF (O .NE. 0) WRITE('ERROR IN MATRIX SOLVE')
        it = it + 1
    ENDDO
    END

! END OF FILE FORTRAN_VERSION.F
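For completeness, the DLANGE replacement mentioned above could look something like this inside Newtons_logreg (a sketch, untested): applied to the M-by-1 column G with norm 'F', DLANGE returns the Euclidean norm, and its WORK array is not referenced in that case, so a length-1 dummy suffices.

!   Additional declarations (DLANGE is a function, not a subroutine):
    REAL*8 DLANGE, WORK(1)
    EXTERNAL DLANGE
!   ...and in place of the NORM2 stopping test:
    IF (DLANGE('F', M, 1, G, M, WORK) .LT. tol) RETURN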

Robert Bassett