"""Benchmark dictionary keys: raw strings versus their md5 digests.

Storing actual strings instead of their md5 value appears to be about
10 times faster.

    >>> md5_speed.run(200,50000)
    md5 build(len,sec): 50000 0.870999932289
    md5 retrv(len,sec): 50000 0.680999994278
    std build(len,sec): 50000 0.259999990463
    std retrv(len,sec): 50000 0.0599999427795

This test actually takes several minutes to generate the random keys used
to populate the dictionaries.  Here is a smaller run, but with longer keys.

    >>> md5_speed.run(1000,4000)
    md5 build(len,sec,per): 4000 0.129999995232 3.24999988079e-005
    md5 retrv(len,sec,per): 4000 0.129999995232 3.24999988079e-005
    std build(len,sec,per): 4000 0.0500000715256 1.25000178814e-005
    std retrv(len,sec,per): 4000 0.00999999046326 2.49999761581e-006

Results are similar, though not statistically too good because of the
short times used and the available clock resolution.  Still, I think it is
safe to say that, for speed, it is better to store entire strings instead
of using md5 versions of their strings.

Yeah, the expected result, but it never hurts to check...

NOTE: the figures above were recorded with an earlier revision of this
script (``md5`` module, ``time.time()`` clock); absolute numbers from the
current code will differ.
"""

from __future__ import absolute_import, print_function

import hashlib
import random
import timeit

# Characters the random keys are drawn from.
_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'

# Shortest key that generate_random() will produce.
_MIN_LENGTH = 30

# timeit's default timer is the clock the standard library itself uses for
# benchmarking; it replaces the coarse time.time() readings complained about
# in the module docstring.
_now = timeit.default_timer


def _md5_key(s):
    """Return the raw md5 digest of *s*.

    hashlib only accepts bytes, so text is encoded as UTF-8 first.  That
    encoding step is part of the price of using digests as keys, which is
    why it is performed inside the timed loops.
    """
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    return hashlib.md5(s).digest()


def _report(label, count, elapsed):
    """Print one result line: label, item count, total and per-item seconds.

    The per-item figure is reported as 0.0 when *count* is not positive, so
    that an empty run does not fail with a division by zero.
    """
    per_item = elapsed / count if count > 0 else 0.0
    print(label, count, elapsed, per_item)


def speed(n, m):
    """Print the average time, in seconds, to md5-hash an *n*-byte string.

    n -- length of the string of 'a' characters that gets hashed.
    m -- number of hashes to time.  It is truncated with int(), so a float
         such as 1e6 is accepted, e.g. ``speed(50, 1e6)``.

    Prints 0.0 when *m* is not positive, because nothing was timed.
    """
    m = int(m)
    payload = b'a' * n
    md5 = hashlib.md5
    start = _now()
    for _ in range(m):
        md5(payload).digest()
    elapsed = _now() - start
    print(elapsed / m if m > 0 else 0.0)


def generate_random(avg_length, count):
    """Return a list of *count* random lowercase ASCII strings.

    Each length is drawn with random.randrange() from the half-open range
    [30, 2 * avg_length + 30).

    Raises ValueError (from random.randrange) when that range is empty,
    i.e. when avg_length is not positive and count is greater than zero.
    """
    lo, hi = _MIN_LENGTH, avg_length * 2 + _MIN_LENGTH
    choice = random.choice
    # For every string the length is drawn first and its characters after,
    # so a seeded generator is consumed in a fixed, reproducible order.
    return [
        ''.join(choice(_ALPHABET) for _ in range(random.randrange(lo, hi)))
        for _ in range(count)
    ]


def md5_dict(lst):
    """Time building and querying a dict keyed by the md5 digest of each string.

    lst -- sequence of strings (text or bytes); it is iterated twice.

    Prints one 'md5 build' line and one 'md5 retrv' line; returns None.
    """
    catalog = {}

    start = _now()
    for s in lst:
        catalog[_md5_key(s)] = None
    _report('md5 build(len,sec,per):', len(lst), _now() - start)

    start = _now()
    for s in lst:
        # The digest is recomputed on purpose: a lookup by digest has to
        # hash the string again.
        _ = catalog[_md5_key(s)]
    _report('md5 retrv(len,sec,per):', len(lst), _now() - start)


def std_dict(lst):
    """Time building and querying a dict keyed directly by each string.

    lst -- sequence of hashable strings; it is iterated twice.

    Prints one 'std build' line and one 'std retrv' line; returns None.
    """
    catalog = {}

    start = _now()
    for s in lst:
        catalog[s] = None
    _report('std build(len,sec,per):', len(lst), _now() - start)

    start = _now()
    for s in lst:
        _ = catalog[s]
    _report('std retrv(len,sec,per):', len(lst), _now() - start)


def run(m=200, n=10):
    """Generate *n* random keys and run both benchmarks on the same list.

    m -- passed to generate_random() as avg_length.
    n -- number of keys to generate.
    """
    lst = generate_random(m, n)
    md5_dict(lst)
    std_dict(lst)


# Runs on import as well as when executed as a script, exactly as before.
run(2000, 100)