-
Notifications
You must be signed in to change notification settings - Fork 1
/
spellcheck.py
executable file
·136 lines (108 loc) · 3.59 KB
/
spellcheck.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
spellchecker
Usage:
./spellcheck.py
Requirements:
Python 2.7
Uses /usr/share/dict/words for word list
@author: kristi
"""
import re
from collections import defaultdict
from cmd import Cmd
VOWEL = set("aeiouy")
VOWEL_RE = re.compile(r'[aeiouy]')
REPEAT_RE = re.compile(r'(.)\1+')
def toPattern(word):
def toUpper(matchobj):
return matchobj.group(1).upper()
pattern = word
# Vowels get replaced with 'A'
pattern = VOWEL_RE.sub('A', pattern)
# Repeated letters are replaced with a single letter
pattern = REPEAT_RE.sub(r'\1', pattern)
return pattern
def tokenize(word):
"""
Split word by repeated character. Vowels get lumped together.
>>> tokenize('oreooo')
['o', 'r', 'eooo']
"""
return [m.group(0) for m in re.finditer(r'[aeiouy]+|(.)\1*', word)]
def pick_suggestion(word, candidates):
"""
Pick best candidate word using a simple heuristic score
Alternate scoring methodologies include:
Python's get_close_matches string similarity via SequenceMatcher's
Ratcliff-Obershelp based algorithm
difflib.get_close_matches(word, candidates, n=1, cutoff=0.0)[0]
http://docs.python.org/library/difflib.html#difflib.SequenceMatcher
http://hg.python.org/cpython/file/70274d53c1dd/Lib/difflib.py
Levenshtein distance or edit distance
http://en.wikipedia.org/wiki/Levenshtein_distance
Using a hidden markov model for each candidate
http://en.wikipedia.org/wiki/Viterbi_algorithm
http://en.wikipedia.org/wiki/Forward-backward_algorithm
"""
word_tokens = tokenize(word)
suggestion = None
best_score = 0
for candidate in candidates:
candidate_tokens = tokenize(candidate)
score = 1.0
for w, c in zip(word_tokens, candidate_tokens):
set_w = set(w)
set_c = set(c)
# Don't choose words with fewer letters
if len(w) < len(c):
score = 0
break
if w == c:
score += len(c)
else:
# penalize changing vowels
factor = (len(set_w & set_c) + 1) / (len(set_c) + 1.0)
# small penalization for repeating letters
factor *= 10 / (len(w) - len(c) + 10.0)
score += factor * 0.8
if score > best_score:
best_score = score
suggestion = candidate
return suggestion
class SpellCmd(Cmd):
def __init__(self, wordfile):
Cmd.__init__(self)
self.prompt = "> "
self.intro = "Spell checker thing. Enter some words."
self.wordfile = wordfile
self.words = dict.fromkeys(
(line.strip().lower() for line in open(wordfile)), 1)
self.approx = defaultdict(set)
for word in self.words.iterkeys():
pattern = toPattern(word)
self.approx[pattern].add(word)
def emptyline(self):
pass
def default(self, line):
if line == "EOF":
exit(0)
word = line.strip().lower()
patt = toPattern(word)
if word in self.words:
print word
elif patt in self.approx:
candidates = self.approx[patt]
suggestion = pick_suggestion(word, candidates)
if suggestion:
print suggestion
else:
print "NO SUGGESTION"
else:
print "NO SUGGESTION"
if __name__ == "__main__":
#wordfile = "words.txt"
wordfile = "/usr/share/dict/words"
sc = SpellCmd(wordfile)
sc.cmdloop()