I wanted to verify Xiaoning Bian's answer, but unfortunately he didn't post his code. So I implemented a little test suite and ran different little hashing functions on the list of 466K English words to see number of collisions for each:
Hash function | Collisions | Time (words) | Time (file)
=================================================================
CRC32 | 23 (0.005%) | 112 ms | 38 ms
MurmurOAAT | 26 (0.006%) | 86 ms | 10 ms
FNV hash | 32 (0.007%) | 87 ms | 7 ms
Jenkins OAAT | 36 (0.008%) | 90 ms | 8 ms
DJB2 hash | 344 (0.074%) | 87 ms | 5 ms
K&R V2 | 356 (0.076%) | 86 ms | 5 ms
Coffin | 763 (0.164%) | 86 ms | 4 ms
x17 hash | 2242 (0.481%) | 87 ms | 7 ms
-----------------------------------------------------------------
MurmurHash3_x86_32 | 19 (0.004%) | 90 ms | 3 ms
I included time for both: hashing all words individually and hashing the entire file of all English words once. I also included a more complex MurmurHash3_x86_32 into my test for reference.
Conclusion:
- there is almost no point of using the popular DJB2 hash function for strings on Intel x86-64 architecture. Because it has much more collisions than similar functions (MurmurOAAT, FNV and Jenkins OAAT) while having very similar throughput. Bernstein's DJB2 performs especially bad on short strings. Example collisions:
Liz/MHz, Bon/COM, Rey/SEX.
Test code:
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#define MAXLINE 2048
#define SEED 0x12345678
uint32_t DJB2_hash(const uint8_t *str)
{
uint32_t hash = 5381;
uint8_t c;
while ((c = *str++))
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
return hash;
}
uint32_t FNV(const void* key, int len, uint32_t h)
{
// Source: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
h ^= 2166136261UL;
const uint8_t* data = (const uint8_t*)key;
for(int i = 0; i < len; i++)
{
h ^= data[i];
h *= 16777619;
}
return h;
}
uint32_t MurmurOAAT_32(const char* str, uint32_t h)
{
// One-byte-at-a-time hash based on Murmur's mix
// Source: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
for (; *str; ++str) {
h ^= *str;
h *= 0x5bd1e995;
h ^= h >> 15;
}
return h;
}
uint32_t KR_v2_hash(const char *s)
{
// Source: https://stackoverflow.com/a/45641002/5407270
uint32_t hashval = 0;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31*hashval;
return hashval;
}
uint32_t Jenkins_one_at_a_time_hash(const char *str, size_t len)
{
uint32_t hash, i;
for(hash = i = 0; i < len; ++i)
{
hash += str[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
hash += (hash << 3);
hash ^= (hash >> 11);
hash += (hash << 15);
return hash;
}
uint32_t crc32b(const uint8_t *str) {
// Source: https://stackoverflow.com/a/21001712
unsigned int byte, crc, mask;
int i = 0, j;
crc = 0xFFFFFFFF;
while (str[i] != 0) {
byte = str[i];
crc = crc ^ byte;
for (j = 7; j >= 0; j--) {
mask = -(crc & 1);
crc = (crc >> 1) ^ (0xEDB88320 & mask);
}
i = i + 1;
}
return ~crc;
}
inline uint32_t _rotl32(uint32_t x, int32_t bits)
{
return x<<bits | x>>(32-bits); // C idiom: will be optimized to a single operation
}
uint32_t Coffin_hash(char const *input) {
// Source: https://stackoverflow.com/a/7666668/5407270
uint32_t result = 0x55555555;
while (*input) {
result ^= *input++;
result = _rotl32(result, 5);
}
return result;
}
uint32_t x17(const void * key, int len, uint32_t h)
{
// Source: https://github.com/aappleby/smhasher/blob/master/src/Hashes.cpp
const uint8_t * data = (const uint8_t*)key;
for (int i = 0; i < len; ++i)
{
h = 17 * h + (data[i] - ' ');
}
return h ^ (h >> 16);
}
uint32_t apply_hash(int hash, const char* line)
{
switch (hash) {
case 1: return crc32b((const uint8_t*)line);
case 2: return MurmurOAAT_32(line, SEED);
case 3: return FNV(line, strlen(line), SEED);
case 4: return Jenkins_one_at_a_time_hash(line, strlen(line));
case 5: return DJB2_hash((const uint8_t*)line);
case 6: return KR_v2_hash(line);
case 7: return Coffin_hash(line);
case 8: return x17(line, strlen(line), SEED);
default: break;
}
return 0;
}
int main(int argc, char* argv[])
{
// Read arguments
const int hash_choice = atoi(argv[1]);
char const* const fn = argv[2];
// Read file
FILE* f = fopen(fn, "r");
// Read file line by line, calculate hash
char line[MAXLINE];
while (fgets(line, sizeof(line), f)) {
line[strcspn(line, "\n")] = '\0'; // strip newline
uint32_t hash = apply_hash(hash_choice, line);
printf("%08x\n", hash);
}
fclose(f);
return 0;
}
P.S. A more comprehensive review of speed and quality of modern hash functions can be found in SMHasher repository of Reini Urban (rurban). Notice the "Quality problems" column in the table.