Как можно ещё ускорить выполнение такого скрипта:
#!/usr/bin/env python
import sys, difflib, re, os
from pathlib import Path
def regexp_compile(r):
    """Compile *r* into a case-insensitive regular expression.

    Args:
        r: Pattern source accepted by ``re.compile``.

    Returns:
        The compiled pattern with ``re.IGNORECASE`` set.
    """
    return re.compile(r, re.IGNORECASE)
def words_generator(buff, *, offset_bytes=4):
    """Yield every headword stored in the raw bytes of an ``.idx`` file.

    Each record is a UTF-8 headword, a NUL terminator, a data offset and a
    32-bit data size.  Only the headword is decoded; the numeric fields are
    skipped.

    Args:
        buff: Complete contents of the index file as ``bytes``.
        offset_bytes: Width of the data-offset field in bytes.  The default
            of 4 is the 32-bit layout; pass 8 for indexes written with
            64-bit offsets (StarDict ``idxoffsetbits=64``).

    Yields:
        Headwords as ``str``, in file order.

    Raises:
        UnicodeDecodeError: If a headword is not valid UTF-8.
    """
    # 1 byte for \0 + data offset + 4 bytes for the data size.
    record_tail = 1 + offset_bytes + 4
    length = len(buff)
    start = 0
    while start < length:
        end = buff.find(b'\x00', start)
        if end < 0:
            # No terminator left: the remaining bytes are not a complete
            # record.  Without this check `end` would be -1, the slice
            # would be wrong and `start` would be reset to a small
            # constant, looping forever on a truncated file.
            break
        yield buff[start:end].decode('utf-8')
        start = end + record_tail
class RegexAccumulator:
    """Collect the distinct headwords that fully match a regular expression.

    Matching uses the pattern exactly as given (no flags are added).
    """

    def __init__(self, regexp):
        """Prepare an empty collection for *regexp*.

        Args:
            regexp: Pattern source (or compiled pattern) every accepted
                word must match in full.

        Raises:
            re.error: If *regexp* is not a valid pattern.
        """
        # Accepted words in first-seen order.
        self.collection = []
        # Mirror of `collection` so the duplicate check is O(1) instead of
        # a scan over an ever-growing list.
        self._seen = set()
        # Compile once; the bound method is what the per-word test calls.
        self._fullmatch = re.compile(regexp).fullmatch

    def filter(self, s):
        """Return True if *s* matches the pattern and was not collected yet."""
        return s not in self._seen and self._fullmatch(s) is not None

    def accumulate(self, words):
        """Add every new matching word from *words* to the collection.

        Duplicates are dropped both across calls and within one call.
        """
        for candidate in words:
            if self.filter(candidate):
                self._seen.add(candidate)
                self.collection.append(candidate)

    def result(self):
        """Return the collected words, most recently found first.

        A new list is returned each time, so repeated calls give the same
        order and the stored collection is left untouched.
        """
        return self.collection[::-1]
class UsualAccumulator:
    """Collect headwords that contain, or closely resemble, a search word.

    Two groups are kept: words containing the search word as a
    case-insensitive substring (``list1``) and words that
    ``difflib.get_close_matches`` considers similar (``mlist``).
    """

    def __init__(self, word):
        """Prepare empty collections for *word*.

        Args:
            word: The literal text to look for; it is escaped before being
                used as a pattern.
        """
        self.word = word
        # Hits in first-seen order.
        self.list1 = []
        self.mlist = []
        # Mirrors of the two lists so membership tests are O(1) instead of
        # scanning lists that grow with every index file.
        self._seen_list1 = set()
        self._seen_mlist = set()
        # One case-insensitive pattern serves both tests: `search` for
        # "contains the word", `match` for "starts with the word".
        # NOTE(review): `search` differs from the former `".*" + word`
        # with `match` only for a headword that has a newline before the
        # hit, since `.` does not cross newlines.
        self.regex2 = regexp_compile(re.escape(word))

    def rfilter(self, s):
        """Return True if *s* contains the word and is not in ``list1``."""
        return s not in self._seen_list1 and self.regex2.search(s) is not None

    def mfilter(self, s):
        """Return True if *s* is in neither ``list1`` nor ``mlist``."""
        return s not in self._seen_list1 and s not in self._seen_mlist

    def accumulate(self, words):
        """Scan *words* and record new substring hits and close matches.

        Args:
            words: Headwords of one index.  It is iterated twice, so it
                must be a re-iterable sequence such as a list.
        """
        for candidate in words:
            if self.rfilter(candidate):
                self._seen_list1.add(candidate)
                self.list1.append(candidate)
        close = difflib.get_close_matches(self.word,
                                          words,
                                          n=400,
                                          cutoff=0.7)
        for candidate in close:
            if self.mfilter(candidate):
                self._seen_mlist.add(candidate)
                self.mlist.append(candidate)

    def result(self):
        """Return all distinct hits ordered from least to most relevant.

        Relevance tiers, best first before the final reversal: the exact
        word, words starting with it, other words containing it, then
        close matches.  The list is reversed so the best hit comes last.
        """
        ordered = []
        emitted = set()

        def emit(items):
            # Append each item once, keeping first-seen order.
            for item in items:
                if item not in emitted:
                    emitted.add(item)
                    ordered.append(item)

        # The escaped pattern always matches the word itself, so the exact
        # word is a prefix hit whenever it was collected at all.
        if self.word in self._seen_list1:
            emit((self.word,))
        starts_with = self.regex2.match
        emit(w for w in self.list1 if starts_with(w))
        emit(self.list1)
        emit(self.mlist)
        ordered.reverse()
        return ordered
if __name__ == "__main__":
    # Command line: either a plain WORD, or "-r REGEXP" for a regex search.
    if len(sys.argv) < 2:
        sys.exit("usage: %s WORD | -r REGEXP" % sys.argv[0])
    word = sys.argv[1]
    if word == "-r" and len(sys.argv) > 2:
        accumulator = RegexAccumulator(sys.argv[2])
    else:
        accumulator = UsualAccumulator(word)
    data_dir = os.environ.get('STARDICT_DATA_DIR')
    if not data_dir:
        # Fail with a readable message instead of a KeyError traceback.
        sys.exit("STARDICT_DATA_DIR is not set")
    # Every index file below the data directory, any letter case.
    files = Path(data_dir).rglob("*.[iI][dD][xX]")
    for name in files:
        with open(name, 'rb') as f:
            ## My biggest .idx file ~ 11Mb so ...
            buff = f.read()
        # A list, not the bare generator: an accumulator may iterate the
        # words more than once.
        accumulator.accumulate(list(words_generator(buff)))
    # One write for the whole result instead of one print call per word.
    sys.stdout.write("".join(found + "\n" for found in accumulator.result()))
Скрипт ищет слова в словарях StarDict (описание формата: http://stardict-4.sourceforge.net/StarDictFileFormat, секция {3}).
Очевидно, что узкое место — методы accumulate. Возможно, вместо words_generator и метода result у UsualAccumulator тоже можно использовать что-то более производительное.