Repository : http://git.fedorahosted.org/git/?p=ibus-typing-booster.git
On branch : miketmp-debug
>---------------------------------------------------------------
commit f17968e455ae6edbcc97dfb5fac27f082bc8256c
Author: Mike FABIAN <mfabian(a)redhat.com>
Date: Fri Oct 25 15:14:26 2013 +0200
WIP: use multiple dictionaries
>---------------------------------------------------------------
ibus-typing-booster/engine/hunspell_suggest.py | 157 ++++++++++++------------
ibus-typing-booster/engine/tabsqlitedb.py | 2 +-
2 files changed, 79 insertions(+), 80 deletions(-)
diff --git a/ibus-typing-booster/engine/hunspell_suggest.py b/ibus-typing-booster/engine/hunspell_suggest.py
index 2602f02..1d97bb1 100755
--- a/ibus-typing-booster/engine/hunspell_suggest.py
+++ b/ibus-typing-booster/engine/hunspell_suggest.py
@@ -37,72 +37,85 @@ except:
max_words = 100
max_words_row = 50
-class Hunspell:
- def __init__(self,loc='/usr/share/myspell/',dict_name='en_US'):
- self.normalization_form_internal = 'NFD'
- self.loc = loc
- self.dict_name = dict_name + '.dic'
- self.aff_name = dict_name + '.aff'
+normalization_form_internal = 'NFD'
+
+class Dictionary:
+ def __init__(self, name=u'en_US'):
+ self.loc = '/usr/share/myspell'
+ self.name = name
self.encoding = 'UTF-8'
- self.dict_buffer = None
- self.aff_buffer = None
+ self.buffer = None
+ self.pyhunspell_object = None
self.load_dictionary()
def load_dictionary(self):
- self.encoding = 'UTF-8'
- self.dict_buffer = None
- self.aff_buffer = None
- self.pyhunspell_object = None
print "load_dictionary() ..."
- if not os.path.isfile(self.loc+self.dict_name) or not os.path.isfile(self.loc+self.aff_name):
- print "load_dictionary(): .dic or .aff file missing."
+ dic_path = os.path.join(self.loc, self.name+'.dic')
+ aff_path = os.path.join(self.loc, self.name+'.aff')
+ if not os.path.isfile(dic_path) or not os.path.isfile(aff_path):
+ print("load_dictionary %(n)s: %(d)s %(a)s file missing."
+ %{'n': self.name, 'd': dic_path, 'a': aff_path})
return
try:
- self.aff_buffer = open(
- self.loc+self.aff_name).read().replace('\r\n', '\n')
+ aff_buffer = open(aff_path).read().replace('\r\n', '\n')
except:
import traceback
traceback.print_exc()
- if self.aff_buffer:
+ if aff_buffer:
encoding_pattern = re.compile(
r'^[\s]*SET[\s]+(?P<encoding>[-a-zA-Z0-9_]+)[\s]*$',
re.MULTILINE|re.UNICODE)
- match = encoding_pattern.search(self.aff_buffer)
+ match = encoding_pattern.search(aff_buffer)
if match:
self.encoding = match.group('encoding')
print "load_dictionary(): encoding=%(enc)s found in %(aff)s" %{
- 'enc': self.encoding, 'aff': self.loc+self.aff_name}
+ 'enc': self.encoding, 'aff': aff_path}
try:
- self.dict_buffer = codecs.open(
- self.loc+self.dict_name).read().decode(self.encoding).replace('\r\n', '\n')
+ self.buffer = codecs.open(
+ dic_path).read().decode(self.encoding).replace('\r\n', '\n')
except:
print "load_dictionary(): loading %(dic)s as %(enc)s encoding failed, fall back to ISO-8859-1." %{
- 'dic': self.loc+self.dict_name, 'enc': self.encoding}
+ 'dic': dic_path, 'enc': self.encoding}
self.encoding = 'ISO-8859-1'
try:
- self.dict_buffer = codecs.open(
- self.loc+self.dict_name).read().decode(self.encoding).replace('\r\n', '\n')
+ self.buffer = codecs.open(
+ dic_path).read().decode(self.encoding).replace('\r\n', '\n')
except:
- print "load_dictionary(): loading %(dic)s as %(enc)s encoding failed, giving up." %{
- 'dic': self.loc+self.dict_name, 'enc': self.encoding}
- self.dict_buffer = None
- self.aff_buffer = None
+ print("load_dictionary(): loading %(dic)s as %(enc)s encoding failed, giving up." %{
+ 'dic': dic_path, 'enc': self.encoding})
+ self.buffer = None
import traceback
traceback.print_exc()
- if self.dict_buffer:
- self.dict_buffer = unicodedata.normalize(
- self.normalization_form_internal, self.dict_buffer)
- if import_hunspell_successful:
- self.pyhunspell_object = hunspell.HunSpell(
- self.loc+self.dict_name,
- self.loc+self.aff_name)
- else:
- self.pyhunspell_object = None
+ return
+ if self.buffer:
+ self.buffer = unicodedata.normalize(
+ normalization_form_internal, self.buffer)
+ if import_hunspell_successful:
+ self.pyhunspell_object = hunspell.HunSpell(
+ dic_path, aff_path)
+ else:
+ self.pyhunspell_object = None
+
+class Hunspell:
+ def __init__(self, dictionary_names=['en_US']):
+ self.dictionaries = []
+ print("mike dictionary_names=%s\n" %dictionary_names)
+ for dictionary_name in dictionary_names:
+ self.dictionaries.append(Dictionary(name=dictionary_name))
- def words_start(self,word):
- sys.stderr.write("mike in words_start word=%s\n" %word.encode('UTF-8'))
- if type(word) != type(u''):
- word = word.decode('utf8')
+ def suggest(self, input_phrase):
+ # If the input phrase is very long, don’t try looking
+ # something up in the hunspell dictionaries. The regexp match
+ # gets very slow if the input phrase is very long. And there
+ # are no very long words in the hunspell dictionaries anyway,
+ # the longest word in the German hunspell dictionary currently
+ # seems to be “Geschwindigkeitsübertretungsverfahren”; trying
+ # to match words longer than that just wastes time.
+ if len(input_phrase) > 40:
+ return []
+ if type(input_phrase) != type(u''):
+ input_phrase = input_phrase.decode('utf8')
+ sys.stderr.write("mike in suggest input_phrase=%(ip)s\n" %{'ip': input_phrase.encode('UTF-8')})
# http://pwet.fr/man/linux/fichiers_speciaux/hunspell says:
#
# > A dictionary file (*.dic) contains a list of words, one per
@@ -114,51 +127,37 @@ class Hunspell:
# I.e. if '/' is already contained in the input, it cannot
# match a word in the dictionary and we return an empty list
# immediately:
- if '/' in word:
+ if '/' in input_phrase:
return []
# And we should not match further than '/'.
# Take care to use a non-greedy regexp to match only
# one line and not accidentally big chunks of the file!
try:
- regexp = r'^'+re.escape(word)+r'.*?(?=/|$)'
+ regexp = r'^'+re.escape(input_phrase)+r'.*?(?=/|$)'
patt_start = re.compile(regexp,re.MULTILINE|re.UNICODE)
except:
import traceback
traceback.print_exc()
- if self.dict_buffer != None:
- start_words = patt_start.findall(self.dict_buffer)
- if self.pyhunspell_object != None:
- if len(word) >= 4:
- # Always pass NFC to pyhunspell and convert the
- # result back to NFKD, even for Korean (For
- # Korean, hunspell does a NFC -> NFKD conversion
- # of the input and NFKD->NFC conversion of the
- # output)
- word = unicodedata.normalize('NFC', word)
- extra_suggestions = map(
- lambda x: unicodedata.normalize(
- self.normalization_form_internal, x.decode(self.encoding)),
- self.pyhunspell_object.suggest(word.encode(self.encoding, 'replace')))
- for suggestion in extra_suggestions:
- if suggestion not in start_words:
- start_words.append(suggestion)
- else:
- start_words = [u'☹ %(loc)s%(dict_name)s not found.' %{'loc': self.loc, 'dict_name': self.dict_name}, u'☹ please install hunspell dictionary!']
-# sys.stderr.write("mike words=%(w)s\n" %{'w': list(set(start_words[0:max_words]))})
- return list(set(start_words[0:max_words]))
-
- def suggest(self, input_phrase):
- # If the input phrase is very long, don’t try looking
- # something up in the hunspell dictionaries. The regexp match
- # gets very slow if the input phrase is very long. And there
- # are no very long words in the hunspell dictionaries anyway,
- # the longest word in the German hunspell dictionary currently
- # seems to be “Geschwindigkeitsübertretungsverfahren” trying
- # to match words longer than that just wastes time.
- if len(input_phrase) > 40:
- return []
-# sys.stderr.write("mike in suggest word=%(ip)s\n" %{'ip': input_phrase.encode('UTF-8')})
- return self.words_start(input_phrase)
-
-
+ suggested_words = []
+ for dictionary in self.dictionaries:
+ if dictionary.buffer:
+ suggested_words += patt_start.findall(dictionary.buffer)
+ if dictionary.pyhunspell_object:
+ if len(input_phrase) >= 4:
+ # Always pass NFC to pyhunspell and convert the
+ # result back to the internal normalization form (NFD);
+ # hunspell does the right thing for Korean if the input is NFC.
+ input_phrase = unicodedata.normalize('NFC', input_phrase)
+ extra_suggestions = map(
+ lambda x: unicodedata.normalize(
+ normalization_form_internal, x.decode(dictionary.encoding)),
+ dictionary.pyhunspell_object.suggest(input_phrase.encode(dictionary.encoding, 'replace')))
+ for suggestion in extra_suggestions:
+ if suggestion not in suggested_words:
+ suggested_words.append(suggestion)
+ else:
+ dic_path = os.path.join(dictionary.loc, dictionary.name+'.dic')
+ suggested_words.insert(
+ 0, u'☹ %(dic_path)s not found. Please install hunspell dictionary!' %{'dic_path': dic_path})
+ return suggested_words[0:max_words]
diff --git a/ibus-typing-booster/engine/tabsqlitedb.py b/ibus-typing-booster/engine/tabsqlitedb.py
index 17e0ee8..a156027 100755
--- a/ibus-typing-booster/engine/tabsqlitedb.py
+++ b/ibus-typing-booster/engine/tabsqlitedb.py
@@ -90,7 +90,7 @@ class tabsqlitedb:
self._normalization_form_internal = 'NFD'
self.hunspell_obj = hunspell_suggest.Hunspell(
- dict_name=self.ime_properties.get("hunspell_dict").replace('.dic', ''))
+ dictionary_names=[self.ime_properties.get("hunspell_dict").replace('.dic', '')])
#user_db = self.ime_properties.get("name")+'-user.db'
user_db = 'user.db'