It's possible to combine a lookup table with int.to_bytes (Python 3 only):
popcount8bit = bytes([popcount(x) for x in range(1<<8)]) # use any method to initialize this lookup table
popcount = lambda x: sum(map(popcount8bit.__getitem__,
x.to_bytes((x.bit_length()+7)//8, "little")))
This solution unfortunately is about 20% slower than bin(x).count('1') on Python 3, but twice faster on PyPy3.
This is a benchmark script, compares several different solutions presented here, for different number of bits:
from __future__ import print_function #for Python 2
import sys
from timeit import timeit
import random
def popcount(x): return bin(x).count("1")
version3=sys.version.startswith("3")
for numBit in (2, 4, 8, 16, 31, 32, 63, 64, 1000, 10000):
maximum=int((1<<numBit)-1) #int cast just in case it overflows to long in Python 2
functions=[
(popcount, "bin count"),
(lambda x: "{:b}".format(x).count("1"), "format string count"),
]
try:
import gmpy
functions.append((gmpy.popcount, "gmpy"))
except ImportError:
pass
if sys.version.startswith("3"):
exec('''functions.append((lambda x: f"{x:b}".count("1"), "f-string count"))''')
if numBit<=16:
table1=[popcount(x) for x in range(maximum+1)]
functions.append((lambda x: table1[x], "lookup list"))
functions.append((table1.__getitem__, "lookup list without lambda"))
table2="".join(map(chr, table1))
functions.append((lambda x: ord(table2[x]), "lookup str"))
if version3:
table3=bytes(table1)
functions.append((lambda x: table3[x], "lookup bytes"))
if numBit==8:
functions.append((
b'\x00\x01\x01\x02\x01\x02\x02\x03\x01\x02\x02\x03\x02\x03\x03\x04'
b'\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05'
b'\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07'
b'\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07'
b'\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07'
b'\x04\x05\x05\x06\x05\x06\x06\x07\x05\x06\x06\x07\x06\x07\x07\x08'
.__getitem__, "lookup bytes hard coded 8 bit"))
table_hardcoded=(
b'\x00\x01\x01\x02\x01\x02\x02\x03\x01\x02\x02\x03\x02\x03\x03\x04'
b'\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05'
b'\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07'
b'\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07'
b'\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06'
b'\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07'
b'\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07'
b'\x04\x05\x05\x06\x05\x06\x06\x07\x05\x06\x06\x07\x06\x07\x07\x08')
functions.append((
table_hardcoded.__getitem__, "lookup bytes hard coded 8 bit local variable"))
functions.append((table3.__getitem__, "lookup bytes without lambda"))
if version3:
popcount8bit=bytes([popcount(x) for x in range(1<<8)]) #bytes because benchmark says that it's fastest
functions.append((
lambda x: sum(popcount8bit[x] for x in x.to_bytes((x.bit_length()+7)//8, "big")),
"to_bytes"
))
functions.append((
lambda x: sum(map(popcount8bit.__getitem__, x.to_bytes((x.bit_length()+7)//8, "big"))),
"to_bytes without list comprehension"
))
functions.append((
lambda x: sum(map(popcount8bit.__getitem__, x.to_bytes((x.bit_length()+7)//8, "little"))),
"to_bytes little endian, without list comprehension"
))
#for x in (2, 4, 8, 16, 32, 64):
# table1=[popcount(x) for x in range(1<<8)]
print("====== numBit=", numBit)
data=[]
numRepeat=10**7//(numBit+100)
for popcountFunction, description in functions:
random.seed(10) #make randint returns the same value
data.append((
timeit(lambda: popcountFunction(random.randint(0, maximum)), number=numRepeat),
description
))
time1, name1=data[0]
assert name1=="bin count"
data.sort()
maxLength=0
for time, description in data:
maxLength=max(maxLength, len(description))
for time, description in data:
print("{:{}} -> {:2f} = {} * {:2f}".format(description, maxLength+2, time, name1, time/time1))
It works with both Python 2 and 3; however, if a solution is unavailable for Python 2, it's not measured.
Some solutions are not listed here.
Result:
- Python 2: "lookup list without lambda" is the fastest (25% faster than "bin count", 6% faster than "lookup list" (with lambda)) for <= 16 bits, larger than that "bin count" is the fastest. (I didn't install gmpy for Python 2)
- Python 3: Roughly the same result.
- "Lookup bytes without lambda" is comparable (+/-2% compared to "lookup list without lambda").
- gmpy is faster than "bin count` in all cases, but slower than "lookup list without lambda" for about 5% with numBit <= 16.
- "to_bytes" is comparable.
- Using f-string is about 10% slower than "bin count".
- PyPy3: The lambda no longer incurs much cost, and the
to_bytes version becomes much faster (twice faster than "bin count"); however, I could not get gmpy to install.