68 lines
1.7 KiB
Plaintext
68 lines
1.7 KiB
Plaintext
From: jeremy at cnri.reston.va.us (Jeremy Hylton)
|
|
Date: Thu, 15 Apr 1999 22:39:11 GMT
|
|
Subject: for in benchmark interested
|
|
In-Reply-To: <14102.20958.460408.832042@buffalo.fnal.gov>
|
|
References: <37157CE7.EFB470CA@inka.de>
|
|
<14102.15523.573321.443195@bitdiddle.cnri.reston.va.us>
|
|
<14102.20958.460408.832042@buffalo.fnal.gov>
|
|
Message-ID: <14102.20968.308910.844571@bitdiddle.cnri.reston.va.us>
|
|
Content-Length: 1237
|
|
X-UID: 1461
|
|
|
|
Doh!
|
|
|
|
I guess you could read it all at once, which would be fine for a file
|
|
that's only 6MB or so. If you want correctness (how important is
|
|
that in a benchmark anyway?) and still want to read fixed-size chunks,
|
|
then you need to see if the buffer that is read ends in the middle of
|
|
a word or between words. If you add that checking, the code is a bit
|
|
more complex but still about 20% faster.
|
|
|
|
#!/usr/local/bin/python
|
|
import sys
|
|
import string
|
|
|
|
|
|
def run(stream=None, chunk_size=500000):
    """Count whitespace-separated words read from a text stream in chunks.

    The input is consumed ``chunk_size`` characters at a time.  A word that
    straddles a chunk boundary is stitched back together before it is
    counted, so the result does not depend on where the chunks happen to
    fall.

    Args:
        stream: Object with a ``read(n)`` method returning text.  Defaults
            to ``sys.stdin`` (looked up at call time).
        chunk_size: Number of characters requested per ``read`` call.

    Returns:
        A dict mapping each distinct word to its number of occurrences.
    """
    if stream is None:
        stream = sys.stdin

    counts = {}
    # Bound-method lookups are hoisted out of the loop; this function is
    # the hot path of the benchmark.
    counts_get = counts.get
    read = stream.read

    # Trailing fragment of the previous chunk that may continue in the next.
    prev = ''
    while True:
        buf = read(chunk_size)
        if not buf:
            break

        parts = buf.split()
        if not parts:
            # The chunk is nothing but whitespace: it terminates any word
            # still pending from the previous chunk and adds no new ones.
            if prev:
                counts[prev] = counts_get(prev, 0) + 1
                prev = ''
            continue

        if buf[0] == parts[0][0]:
            # Chunk starts with a non-whitespace character, so its first
            # piece continues the fragment carried over (possibly empty).
            parts[0] = prev + parts[0]
        elif prev:
            # Chunk starts with whitespace: the carried fragment was a
            # complete word.
            counts[prev] = counts_get(prev, 0) + 1

        # Every piece except the last is known to be a complete word.
        for key in parts[:-1]:
            counts[key] = counts_get(key, 0) + 1

        if buf[-1] != parts[-1][-1]:
            # Chunk ends with whitespace, so the last piece is complete too.
            key = parts[-1]
            counts[key] = counts_get(key, 0) + 1
            prev = ''
        else:
            # Chunk ends mid-word (or exactly at a word end); defer the
            # decision until the next chunk or end of input.
            prev = parts[-1]

    # End of input: a pending fragment is the final word of the stream.
    if prev:
        counts[prev] = counts_get(prev, 0) + 1
    return counts
|
|
|
|
|
|
# Tally the words on standard input, then print one line per distinct word
# in the form "<count, right-aligned to 4 columns><TAB><word>".
dict = run()
write = sys.stdout.write
for word, count in dict.items():
    write("%4d\t%s\n" % (count, word))
|
|
|
|
|
|
Jeremy
|
|
|
|
|
|
|
|
|