soundex.py

Go to the documentation of this file.
00001 ##
00002 # 
00003 # 
00004 # The soundex algorithm takes an English word, and returns an easily-computed
00005 # hash of it; this hash is intended to be the same for words that sound alike.
00006 # This module provides an interface to the soundex algorithm.
00007 # 
00008 # Note that the soundex algorithm is quite simple-minded, and isn't perfect by
00009 # any measure.  Its main purpose is to help looking up names in databases,
00010 # when the name may be misspelled -- soundex tends to group common
00011 # misspellings together.
00012 # 
00013 # Note that though the functions this module exports have the same names as
00014 # their counterparts in the original soundex module that was distributed with
00015 # Python, the algorithm is not quite the same.  In particular, the original
00016 # algorithm failed to notice doubled first letters (e.g., 'Lloyd').
00017 # 
00018 # You can vary the length of the soundex string calculated by setting the
00019 # module global, NDIGITS.  If you set NDIGITS to 0, it will compute a soundex
00020 # string whose length varies with a length of the input string.
00021 # 
00022 # 
00023 
00024 # Released to the public domain, by Skip Montanaro, 21 December 2000.
00025 
00026 # This module represents a melding of two different soundex modules
00027 # written by Tim Peters and Fred Drake.
00028 
00029 NDIGITS = 3
00030 
00031 # B for Break -- all characters assumed simply to break runs
00032 _tran = ["B"] * 256
00033 def _setcode(letters, value):
00034     for ch in letters:
00035         _tran[ord(ch.upper())] = _tran[ord(ch)] = value
00036 _setcode("bfpv", "1")
00037 _setcode("cgjkqsxz", "2")
00038 _setcode("dt", "3")
00039 _setcode("l", "4")
00040 _setcode("mn", "5")
00041 _setcode("r", "6")
00042 # B for Break -- these guys break runs
00043 _setcode("aeiouy", "B")
00044 # I for Invisible -- they don't count for anything except as first char
00045 _setcode("hw", "I")
00046 assert len(filter(lambda ch: ch != "B", _tran)) == \
00047        (26 - len("aeiouy")) * 2, \
00048        "Soundex initialization screwed up"
00049 _tran = "".join(_tran)
00050 del _setcode
00051 
00052 ##
00053 # name -> Soundex code, following Knuth Vol 3 Ed 2 pg 394.
00054 # 
00055 #     The number of digits generated can be modified by setting the
00056 #     module-level variable, NDIGITS (default 3).
00057 #     
00058 def get_soundex(name):
00059 
00060     if not name:
00061         raise ValueError("soundex requires non-empty name argument")
00062     coded = name.translate(_tran)
00063     out = [name[0].upper()]
00064     lastrealcode = coded[0]
00065     ignore_same = 1
00066     for thiscode in coded[1:]:
00067         if thiscode == "B":
00068             ignore_same = 0
00069             continue
00070         if thiscode == "I":
00071             continue
00072         if ignore_same and lastrealcode == thiscode:
00073             continue
00074         out.append(thiscode)
00075         lastrealcode = thiscode
00076         ignore_same = 1
00077         if NDIGITS and len(out) > NDIGITS:
00078             break
00079     if len(out) < NDIGITS + 1:
00080         out.append("0" * (NDIGITS + 1 - len(out)))
00081     return "".join(out)
00082 
00083 ##
00084 # Return the soundex hash value for a word.
00085 # 
00086 #     It will always be a 6-character string.  `name' must contain the word to
00087 #     be hashed, with no leading whitespace; the case of the word is ignored.
00088 #     This algorithm strives to be compatible with the algorithm in the old
00089 #     soundex.c module.
00090 # 
00091 #     
00092 def get_soundex_orig(name):
00093 
00094     s = name.upper()
00095     if not s:
00096         return '000000'
00097     r = [s[0]]
00098     s = s[1:]
00099     while len(r) < 6 and s:
00100         c = s[0]
00101         s = s[1:]
00102         if c in "WHAIOUY":
00103             pass
00104         elif c in "BFPV":
00105             if r[-1] != '1':
00106                 r.append('1')
00107         elif c in "CGJKQSXZ":
00108             if r[-1] != '2':
00109                 r.append('2')
00110         elif c in "DT":
00111             if r[-1] != '3':
00112                 r.append('3')
00113         elif c == "L":
00114             if r[-1] != '4':
00115                 r.append('4')
00116         elif c in "MN":
00117             if r[-1] != '5':
00118                 r.append('5')
00119         elif c == "R":
00120             if r[-1] != '6':
00121                 r.append('6')
00122     r.append('0' * (6 - len(r)))
00123     return "".join(r)
00124 
00125 ##
00126 # Returns true if both arguments have the same soundex code.
00127 def sound_similar(s1, s2):
00128     return get_soundex(s1) == get_soundex(s2)
00129 
00130 def _test():
00131     global nerrors, NDIGITS
00132     def check(name, expected):
00133         global nerrors
00134         got = get_soundex(name)
00135         if got != expected:
00136             nerrors = nerrors + 1
00137             print "error in Soundex test: name", name, \
00138                   "expected", expected, "got", got
00139     nerrors = 0
00140     NDIGITS = 3
00141     check("Euler", "E460")
00142     check("Ellery", "E460")
00143     check("guass", "G200")
00144     check("gauss", "G200")
00145     check("Ghosh", "G200")
00146     check("HILBERT", "H416")
00147     check("Heilbronn", "H416")
00148     check("Knuth", "K530")
00149     check("K ** n  U123t9247h   ", "K530")
00150     check("Kant", "K530")
00151     check("Lloyd", "L300")
00152     check("Liddy", "L300")
00153     check("Lukasiewicz", "L222")
00154     check("Lissajous", "L222")
00155     check("Wachs", "W200")
00156     check("Waugh", "W200")
00157     check("HYHYH", "H000")
00158     check("kkkkkkkwwwwkkkkkhhhhhkkkkemmnmnhmn", "K500")
00159     check("Rogers", "R262")
00160     check("Rodgers", "R326")
00161     check("Sinclair", "S524")
00162     check("St. Clair", "S324")
00163     check("Tchebysheff", "T212")
00164     check("Chebyshev", "C121")
00165     check("Bib", "B100")
00166     check("Bilbo", "B410")
00167     check("Bibby", "B100")
00168     check("Bibbster", "B123")
00169 
00170     if nerrors:
00171         raise SystemError("soundex test failed with " + `nerrors` +
00172                           " errors")
00173     else:
00174         print "all tests passed"
00175         
00176 if __name__ == "__main__":
00177     _test()
00178 
00179 

© Copyright 2008-2009 Vyper Logix Corp., All Right Reserved; If you reference this document or any part of this document you must use the citation verbatim (including the link) "© Copyright 2008-2009 Vyper Logix Corp., All Right Reserved."

Notice: This source code contained in this document is NOT open source and is NOT being distributed as open source.

122,241 lines of code and growing...