import re
import sys

# Text type used for decoded output: ``unicode`` on Python 2, ``str`` on
# Python 3.  Byte-string input is ``str`` on Python 2 and ``bytes`` on 3.
try:
    _text_type = unicode
except NameError:
    _text_type = str

# Most of the bytes between 0x80 and 0x9F are displayable in cp1252 but are
# control characters in the iso-8859 family.  (Bytes patterns: the input
# being searched is always a byte string.  Requires Python 2.6+.)
_C1_CONTROL_BYTES = re.compile(br"[\x80-\x9f]")

# Bytes that are more likely to be iso-8859-15 symbols than their
# iso-8859-1 / cp1252 counterparts.
_LATIN9_SYMBOL_BYTES = re.compile(br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]")

# Encodings that are always tried, in priority order.
_BUILTIN_ENCODINGS = ("utf-8", "iso-8859-1", "cp1252", "iso-8859-15")


def decode(s, enc=None, denc=sys.getdefaultencoding()):
    """Try interpreting *s* using several possible encodings.

    Parameters:
        s    -- the value to decode: a byte string, or an already-decoded
                text object (which is returned unchanged).
        enc  -- optional caller-provided encoding; always tried first.
        denc -- default encoding; tried right after *enc* unless it is
                "ascii" (pure ASCII input is handled up front).

    Returns a three-element tuple ``(text, lossy, encoding)``:
        text     -- *s* itself when it is already text or is pure ASCII,
                    otherwise the decoded text object.
        lossy    -- 0 on a clean decode; 1 if no candidate encoding
                    round-tripped and the result comes from the fallback
                    pass that decodes with the "ignore" error handler.
        encoding -- name of the encoding that produced *text* ("utf-8" is
                    reported for input that was already text).

    Encoding names that the codec registry does not know are skipped
    instead of aborting the whole detection.
    """
    if isinstance(s, _text_type):
        return s, 0, "utf-8"

    try:
        _text_type(s, "ascii")
    except UnicodeError:
        pass
    else:
        # If it's ascii, we're done; the input is returned undecoded.
        return s, 0, "ascii"

    # Build the candidate list in priority order: caller-provided encoding,
    # then a non-ascii default encoding, then the built-in guesses.
    ordered = []
    if enc:
        ordered.append(enc)
    if denc and denc != "ascii":
        ordered.append(denc)
    ordered.extend(_BUILTIN_ENCODINGS)

    # Drop duplicates (keeping the first occurrence) so that no encoding is
    # attempted twice.
    candidates = []
    for name in ordered:
        if name not in candidates:
            candidates.append(name)

    for candidate in candidates:
        # Skip the iso-8859 encodings when C1 bytes are present, even
        # though decoding might well succeed: cp1252 is the likelier match.
        if (candidate in ("iso-8859-15", "iso-8859-1")
                and _C1_CONTROL_BYTES.search(s) is not None):
            continue

        # Bytes in this set are more likely to be iso-8859-15 symbols, so
        # skip the encodings that would accept them with another meaning.
        if (candidate in ("iso-8859-1", "cp1252")
                and _LATIN9_SYMBOL_BYTES.search(s) is not None):
            continue

        try:
            text = _text_type(s, candidate)
        except (UnicodeError, LookupError):
            # Not valid in this encoding, or the encoding is unknown.
            continue

        # Accept only a decode that encodes back to exactly the input.
        try:
            if text.encode(candidate) == s:
                return text, 0, candidate
        except UnicodeError:
            continue

    # Nothing worked perfectly - try again, but use the "ignore" error
    # handler and return the longest result (the one that dropped the
    # fewest bytes).  On a tie the higher-priority encoding wins.
    best = None
    for candidate in candidates:
        try:
            text = _text_type(s, candidate, "ignore")
        except LookupError:
            continue
        if best is None or len(text) > len(best[0]):
            best = (text, candidate)

    text, encoding = best
    return text, 1, encoding
© Copyright 2008-2009 Vyper Logix Corp., All Right Reserved; If you reference this document or any part of this document you must use the citation verbatim (including the link) "© Copyright 2008-2009 Vyper Logix Corp., All Right Reserved."
Notice: The source code contained in this document is NOT open source and is NOT being distributed as open source.
122,241 lines of code and growing...