Source code for bi_etl.utility.line_counter

#!/usr/bin/env python
import argparse
import mmap
import subprocess
from collections import defaultdict
from timeit import default_timer as timer


def mapcount(filename):
    # Memory-map the file and count lines with the mmap object's readline.
    # Note: "r+" requires write permission on the file.
    f = open(filename, "r+")
    buf = mmap.mmap(f.fileno(), 0)
    lines = 0
    readline = buf.readline
    while readline():
        lines += 1
    return lines

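# A minimal read-only sketch (an assumption, not part of the original module;
# the name is chosen so main()'s name-based discovery does not benchmark it):
# mmap.ACCESS_READ drops the write-access requirement of mapcount's "r+" open,
# and mmap.count() scans for newlines in C instead of looping over readline().
# Unlike mapcount, this does not count a final line without a trailing newline,
# and (like mapcount) it raises ValueError on an empty file.
def mapcount_readonly(filename):
    with open(filename, 'rb') as f:
        buf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            return buf.count(b'\n')
        finally:
            buf.close()
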
def simplecount(filename):
    lines = 0
    for _ in open(filename):
        lines += 1
    return lines

# Fastest for small files
def bufcount(filename):
    f = open(filename)
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.read  # loop optimization
    buf = read_f(buf_size)
    while buf:
        lines += buf.count('\n')
        buf = read_f(buf_size)
    return lines

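# A hedged variant (not in the original benchmark; named so the harness does
# not pick it up): the same chunked scan as bufcount, but over raw bytes.
# Opening in 'rb' skips newline translation and text decoding, which usually
# helps on large files.
def bufcount_binary(filename):
    lines = 0
    with open(filename, 'rb') as f:
        read_f = f.read
        buf = read_f(1024 * 1024)
        while buf:
            lines += buf.count(b'\n')
            buf = read_f(1024 * 1024)
    return lines
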
# Fastest for big files
def wccount(filename):
    out = subprocess.Popen(['wc', '-l', filename],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT
                           ).communicate()[0]
    return int(out.partition(b' ')[0])

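# A modern equivalent of wccount, sketched as an assumption rather than part
# of the original benchmark: subprocess.run(check=True) raises on a non-zero
# exit from wc instead of silently parsing mixed stdout/stderr output.
def wccount_run(filename):
    result = subprocess.run(['wc', '-l', filename],
                            stdout=subprocess.PIPE, check=True)
    return int(result.stdout.split()[0])
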
def itercount(filename):
    # The original used mode 'rbU'; 'U' is deprecated in Python 3 and removed
    # in 3.11. Plain binary mode gives the same line count.
    return sum(1 for _ in open(filename, 'rb'))

def opcount(fname):
    # enumerate(f, 1) leaves line_number at the last line's index;
    # it stays 0 for an empty file.
    line_number = 0
    with open(fname) as f:
        for line_number, _ in enumerate(f, 1):
            pass
    return line_number

def kylecount(fname):
    return sum(1 for _ in open(fname))

def clear_cache():
    """Clear disk cache on Linux."""
    # os.system("sync ; sudo /bin/sh -c 'echo 3 > /proc/sys/vm/drop_caches'")
    pass

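# A hedged sketch of an active implementation of the commented-out command
# above (assumption: Linux-only, requires passwordless sudo; only use on a
# dedicated benchmark machine).
def clear_cache_linux():
    subprocess.check_call('sync', shell=True)
    subprocess.check_call(
        "sudo /bin/sh -c 'echo 3 > /proc/sys/vm/drop_caches'", shell=True)
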
def main():
    parser = argparse.ArgumentParser()
    # Fixed: the original spelled this option '---clear-cache' (three dashes).
    parser.add_argument('--clear-cache', action="store_true")
    parser.add_argument('-n', type=int, default=3)
    parser.add_argument('filename')
    args = parser.parse_args()
    # Fixed: defaultdict() with no factory raises KeyError on append;
    # defaultdict(list) creates an empty list per function.
    counts = defaultdict(list)
    do_clear_cache = args.clear_cache
    filename = args.filename
    for _ in range(args.n):
        # Benchmark every module-level function whose name ends in 'count'.
        for func in (f for n, f in list(globals().items())
                     if n.endswith('count') and hasattr(f, '__call__')):
            if do_clear_cache:
                clear_cache()
            start_time = timer()
            # http://norvig.com/big.txt
            if filename == 'big.txt':
                assert func(filename) == 128457  # 128457 1095695 6488666 big.txt
            else:
                func(filename)
            counts[func].append(timer() - start_time)
    timings = {}
    for key, vals in list(counts.items()):
        timings[key.__name__] = sum(vals) / float(len(vals)), min(vals)
    width = max(len(n) for n in timings) + 1
    print("%s %s %s %s" % ("function".ljust(width),
                           "average, s".rjust(15),
                           "min, s".rjust(15),
                           "ratio".rjust(15)))
    absmin_ = min(x[1] for x in list(timings.values()))
    for name, (av, min_) in sorted(list(timings.items()),
                                   key=lambda x: x[1][1]):
        print(f"{name.ljust(width)} {av:15.4f} {min_:15.4f} {min_ / absmin_:15.4f}")

if __name__ == '__main__':
    main()

# Benchmark results from the original script. Note that the tables include
# fadvcount, a variant that is not defined in this module.
#
# function     average, s    min, s    ratio
# wccount           0.005    0.0042     1.00
# bufcount         0.0081    0.0081     1.91
# fadvcount        0.0094    0.0091     2.13
# opcount           0.018     0.015     3.42
# simplecount       0.019     0.016     3.66
# kylecount         0.019     0.017     4.03
# mapcount          0.027     0.021     4.97
# itercount         0.044     0.031     7.21

# python3.1 ginstrom.py
# function     average, s    min, s    ratio
# wccount          0.0049    0.0046     1.00
# itercount         0.021      0.02     4.47
# mapcount          0.023     0.023     5.09
# bufcount          0.034     0.032     7.02
# opcount           0.043     0.043     9.46
# simplecount        0.05     0.046    10.20
# kylecount          0.05      0.05    10.95

# python ginstrom.py /big/mkv/file
# function     average, s    min, s    ratio
# wccount            0.51      0.49     1.00
# opcount             1.8       1.8     3.58
# simplecount         1.8       1.8     3.66
# kylecount           1.9       1.9     3.75
# mapcount             19         2     4.01
# fadvcount           2.3       2.2     4.52
# bufcount            2.3       2.2     4.52

# wc /big/mkv/file
# 7137518 40523351 1836139137 /big/mkv/file

# with --clear-cache
# function     average, s    min, s    ratio
# simplecount        0.06     0.057     1.00
# opcount           0.067     0.057     1.00
# kylecount         0.057     0.057     1.00
# itercount          0.06     0.058     1.02
# mapcount          0.059     0.058     1.02
# fadvcount         0.064     0.058     1.02
# bufcount           0.07     0.062     1.09
# wccount           0.072     0.065     1.15

# python3.1 with --clear-cache
# function     average, s    min, s    ratio
# itercount         0.061     0.057     1.00
# simplecount       0.069     0.061     1.06
# mapcount          0.062     0.061     1.07
# wccount           0.067     0.064     1.11
# kylecount         0.067     0.065     1.12
# opcount           0.072     0.067     1.17
# bufcount          0.083     0.073     1.27
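# Example invocations of the benchmark harness (the module path is taken from
# this package; the file arguments are placeholders):
#   python -m bi_etl.utility.line_counter -n 5 big.txt
#   python -m bi_etl.utility.line_counter --clear-cache /big/mkv/file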