00001 ## 00002 # 00003 # 00004 # The soundex algorithm takes an English word, and returns an easily-computed 00005 # hash of it; this hash is intended to be the same for words that sound alike. 00006 # This module provides an interface to the soundex algorithm. 00007 # 00008 # Note that the soundex algorithm is quite simple-minded, and isn't perfect by 00009 # any measure. Its main purpose is to help looking up names in databases, 00010 # when the name may be misspelled -- soundex tends to group common 00011 # misspellings together. 00012 # 00013 # Note that though the functions this module exports have the same names as 00014 # their counterparts in the original soundex module that was distributed with 00015 # Python, the algorithm is not quite the same. In particular, the original 00016 # algorithm failed to notice doubled first letters (e.g., 'Lloyd'). 00017 # 00018 # You can vary the length of the soundex string calculated by setting the 00019 # module global, NDIGITS. If you set NDIGITS to 0, it will compute a soundex 00020 # string whose length varies with a length of the input string. 00021 # 00022 # 00023 00024 # Released to the public domain, by Skip Montanaro, 21 December 2000. 00025 00026 # This module represents a melding of two different soundex modules 00027 # written by Tim Peters and Fred Drake. 00028 00029 NDIGITS = 3 00030 00031 # B for Break -- all characters assumed simply to break runs 00032 _tran = ["B"] * 256 00033 def _setcode(letters, value): 00034 for ch in letters: 00035 _tran[ord(ch.upper())] = _tran[ord(ch)] = value 00036 _setcode("bfpv", "1") 00037 _setcode("cgjkqsxz", "2") 00038 _setcode("dt", "3") 00039 _setcode("l", "4") 00040 _setcode("mn", "5") 00041 _setcode("r", "6") 00042 # B for Break -- these guys break runs 00043 _setcode("aeiouy", "B") 00044 # I for Invisible -- they don't count for anything except as first char 00045 _setcode("hw", "I") 00046 assert len(filter(lambda ch: ch != "B", _tran)) == \ 00047 (26 - len("aeiouy")) * 2, \ 00048 "Soundex initialization screwed up" 00049 _tran = "".join(_tran) 00050 del _setcode 00051 00052 ## 00053 # name -> Soundex code, following Knuth Vol 3 Ed 2 pg 394. 00054 # 00055 # The number of digits generated can be modified by setting the 00056 # module-level variable, NDIGITS (default 3). 00057 # 00058 def get_soundex(name): 00059 00060 if not name: 00061 raise ValueError("soundex requires non-empty name argument") 00062 coded = name.translate(_tran) 00063 out = [name[0].upper()] 00064 lastrealcode = coded[0] 00065 ignore_same = 1 00066 for thiscode in coded[1:]: 00067 if thiscode == "B": 00068 ignore_same = 0 00069 continue 00070 if thiscode == "I": 00071 continue 00072 if ignore_same and lastrealcode == thiscode: 00073 continue 00074 out.append(thiscode) 00075 lastrealcode = thiscode 00076 ignore_same = 1 00077 if NDIGITS and len(out) > NDIGITS: 00078 break 00079 if len(out) < NDIGITS + 1: 00080 out.append("0" * (NDIGITS + 1 - len(out))) 00081 return "".join(out) 00082 00083 ## 00084 # Return the soundex hash value for a word. 00085 # 00086 # It will always be a 6-character string. `name' must contain the word to 00087 # be hashed, with no leading whitespace; the case of the word is ignored. 00088 # This algorithm strives to be compatible with the algorithm in the old 00089 # soundex.c module. 00090 # 00091 # 00092 def get_soundex_orig(name): 00093 00094 s = name.upper() 00095 if not s: 00096 return '000000' 00097 r = [s[0]] 00098 s = s[1:] 00099 while len(r) < 6 and s: 00100 c = s[0] 00101 s = s[1:] 00102 if c in "WHAIOUY": 00103 pass 00104 elif c in "BFPV": 00105 if r[-1] != '1': 00106 r.append('1') 00107 elif c in "CGJKQSXZ": 00108 if r[-1] != '2': 00109 r.append('2') 00110 elif c in "DT": 00111 if r[-1] != '3': 00112 r.append('3') 00113 elif c == "L": 00114 if r[-1] != '4': 00115 r.append('4') 00116 elif c in "MN": 00117 if r[-1] != '5': 00118 r.append('5') 00119 elif c == "R": 00120 if r[-1] != '6': 00121 r.append('6') 00122 r.append('0' * (6 - len(r))) 00123 return "".join(r) 00124 00125 ## 00126 # Returns true if both arguments have the same soundex code. 00127 def sound_similar(s1, s2): 00128 return get_soundex(s1) == get_soundex(s2) 00129 00130 def _test(): 00131 global nerrors, NDIGITS 00132 def check(name, expected): 00133 global nerrors 00134 got = get_soundex(name) 00135 if got != expected: 00136 nerrors = nerrors + 1 00137 print "error in Soundex test: name", name, \ 00138 "expected", expected, "got", got 00139 nerrors = 0 00140 NDIGITS = 3 00141 check("Euler", "E460") 00142 check("Ellery", "E460") 00143 check("guass", "G200") 00144 check("gauss", "G200") 00145 check("Ghosh", "G200") 00146 check("HILBERT", "H416") 00147 check("Heilbronn", "H416") 00148 check("Knuth", "K530") 00149 check("K ** n U123t9247h ", "K530") 00150 check("Kant", "K530") 00151 check("Lloyd", "L300") 00152 check("Liddy", "L300") 00153 check("Lukasiewicz", "L222") 00154 check("Lissajous", "L222") 00155 check("Wachs", "W200") 00156 check("Waugh", "W200") 00157 check("HYHYH", "H000") 00158 check("kkkkkkkwwwwkkkkkhhhhhkkkkemmnmnhmn", "K500") 00159 check("Rogers", "R262") 00160 check("Rodgers", "R326") 00161 check("Sinclair", "S524") 00162 check("St. Clair", "S324") 00163 check("Tchebysheff", "T212") 00164 check("Chebyshev", "C121") 00165 check("Bib", "B100") 00166 check("Bilbo", "B410") 00167 check("Bibby", "B100") 00168 check("Bibbster", "B123") 00169 00170 if nerrors: 00171 raise SystemError("soundex test failed with " + `nerrors` + 00172 " errors") 00173 else: 00174 print "all tests passed" 00175 00176 if __name__ == "__main__": 00177 _test() 00178 00179
© Copyright 2008-2009 Vyper Logix Corp., All Right Reserved; If you reference this document or any part of this document you must use the citation verbatim (including the link) "© Copyright 2008-2009 Vyper Logix Corp., All Right Reserved."
Notice: This source code contained in this document is NOT open source and is NOT being distributed as open source.
122,241 lines of code and growing...