decode.py

Go to the documentation of this file.
00001 import sys
##
# Try interpreting s using several possible encodings.
#     The return value is a three-element tuple (text, lossy, encoding):
#       - text is either the original ASCII byte string or a Unicode object;
#       - lossy is 1 if the decoder had to punt and delete some characters
#         from the input to successfully generate a Unicode object, else 0;
#       - encoding is the name of the encoding that produced text
#         ("ascii" for pure-ASCII input, "utf-8" for input that already
#         was a Unicode object).
#
#     enc, if given, is always tried first.  denc (by default the
#     interpreter's default encoding) is tried next unless it is "ascii".
#     After those come utf-8, iso-8859-1, cp1252 and iso-8859-15.
#
#     A codec name that Python does not know raises LookupError at the
#     point where that name is actually tried.
def decode(s, enc=None, denc=sys.getdefaultencoding()):
    # Imported locally: the module header only imports sys.
    import re

    # Python 2 names the text type "unicode"; Python 3 names it "str".
    try:
        text_type = unicode
    except NameError:
        text_type = str

    if isinstance(s, text_type):
        return s, 0, "utf-8"

    try:
        text_type(s, "ascii")
    except UnicodeError:
        pass
    else:
        # if it's ascii, we're done
        return s, 0, "ascii"

    # Build the candidate list in priority order: caller-provided encoding,
    # then a non-ascii default encoding, then the built-in guesses.
    # Duplicates are dropped so no codec is tried twice.
    wanted = []
    if enc:
        wanted.append(enc)
    if denc and denc != "ascii":
        wanted.append(denc)
    wanted.extend(["utf-8", "iso-8859-1", "cp1252", "iso-8859-15"])
    encodings = []
    for name in wanted:
        if name not in encodings:
            encodings.append(name)

    # The pattern sources are pure ASCII (re itself expands the \x escapes),
    # so .encode("ascii") yields a byte pattern on Python 3 and leaves the
    # byte-oriented str of Python 2 unchanged.  Both scans depend only on s,
    # so they are done once instead of once per candidate.

    # Most of the characters between 0x80 and 0x9F are displayable
    # in cp1252 but are control characters in iso-8859-1.
    has_c1_bytes = re.search(r"[\x80-\x9f]".encode("ascii"), s) is not None

    # Characters in this range are more likely to be symbols used in
    # iso-8859-15.
    has_latin9_bytes = re.search(
        r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]".encode("ascii"), s) is not None

    for candidate in encodings:
        # Skip the iso-8859 codecs when 0x80-0x9F bytes are present, even
        # though decoding with them might well succeed.
        if has_c1_bytes and candidate in ("iso-8859-15", "iso-8859-1"):
            continue

        # Skip iso-8859-1 and cp1252 when iso-8859-15 symbols are likely,
        # even though those codecs may accept such strings.
        if has_latin9_bytes and candidate in ("iso-8859-1", "cp1252"):
            continue

        try:
            x = text_type(s, candidate)
        except UnicodeError:
            continue
        # Only accept a decoding that round-trips to the exact input bytes.
        if x.encode(candidate) == s:
            return x, 0, candidate

    # nothing worked perfectly - try again, but use the "ignore" parameter
    # and return the longest result.  On equal length the encoding that
    # comes first in priority order wins.
    best_text = None
    best_enc = None
    for candidate in encodings:
        text = text_type(s, candidate, "ignore")
        if best_text is None or len(text) > len(best_text):
            best_text, best_enc = text, candidate
    return best_text, 1, best_enc
00055 
00056 
00057 

© Copyright 2008-2009 Vyper Logix Corp., All Right Reserved; If you reference this document or any part of this document you must use the citation verbatim (including the link) "© Copyright 2008-2009 Vyper Logix Corp., All Right Reserved."

Notice: The source code contained in this document is NOT open source and is NOT being distributed as open source.

122,241 lines of code and growing...