Package gluon :: Module decoder
[hide private]
[frames] | no frames]

Source Code for Module gluon.decoder

 1  import codecs 
 2   
 3   
 4  """Caller will hand this library a buffer and ask it to either convert 
 5  it or auto-detect the type. 
 6   
 7  Based on http://code.activestate.com/recipes/52257/ 
 8   
 9  Licensed under the PSF License 
10  """ 
11   
# Lookup table from the first four bytes of a document to the encoding
# name they suggest.  A ``None`` slot is a wildcard: the lookup code
# retries with the last two bytes replaced by ``None`` when the exact
# four-byte pattern is not present.
autodetect_dict = dict([
    ((0x00, 0x00, 0xFE, 0xFF), "ucs4_be"),
    ((0xFF, 0xFE, 0x00, 0x00), "ucs4_le"),
    ((0xFE, 0xFF, None, None), "utf_16_be"),
    ((0xFF, 0xFE, None, None), "utf_16_le"),
    ((0x00, 0x3C, 0x00, 0x3F), "utf_16_be"),  # "<?" as 16-bit units, high byte first
    ((0x3C, 0x00, 0x3F, 0x00), "utf_16_le"),  # "<?" as 16-bit units, low byte first
    ((0x3C, 0x3F, 0x78, 0x6D), "utf_8"),      # ASCII bytes for "<?xm"
    ((0x4C, 0x6F, 0xA7, 0x94), "EBCDIC"),
])
23   
24   
def autoDetectXMLEncoding(buffer):
    """ buffer -> encoding_name

    Guess the encoding of an XML document held in a byte buffer.

    The guess starts at "utf_8" (the XML default), is refined from the
    first four bytes via ``autodetect_dict`` when the buffer is at least
    4 bytes long, and is refined once more from the ``encoding``
    attribute of an ``<?xml ...?>`` declaration on the first line, if
    one can be read.

    Always returns a name; when nothing can be detected the default
    "utf_8" is returned.  Note that the returned name might not have an
    installed decoder (e.g. EBCDIC); in that case the declaration cannot
    be inspected and the byte-pattern guess is returned as is.
    """
    encoding = "utf_8"  # according to the XML spec, this is the default

    # Each step below tries to refine the current guess; when a step
    # cannot, the previous guess is kept.
    if len(buffer) >= 4:
        # Indexing bytes yields ints on Python 3 but 1-char strings on
        # Python 2, so only call ord() when it is actually needed.
        signature = tuple(
            item if isinstance(item, int) else ord(item)
            for item in buffer[0:4]
        )
        guess = autodetect_dict.get(signature)
        if not guess:
            # Retry with the potentially variable bytes wildcarded.
            guess = autodetect_dict.get(signature[:2] + (None, None))
        if guess:
            encoding = guess

    # Try to find a more precise encoding using the XML declaration.
    try:
        decode = codecs.getdecoder(encoding)
    except LookupError:
        # No codec for the guessed name: we cannot read the declaration,
        # so the byte-pattern guess is the best available answer.
        return encoding

    # Decode leniently: the guess only has to be good enough to read the
    # ASCII declaration; bytes later in the document that are invalid in
    # the guessed encoding must not abort detection.
    decoded = decode(buffer, "replace")[0]
    first_line = decoded.split(u"\n")[0]

    if first_line.startswith(u"<?xml"):
        encoding_pos = first_line.find(u"encoding")
        if encoding_pos != -1:
            # The value is delimited by whichever quote character comes
            # first after the attribute name (a later attribute may use
            # the other kind of quote).
            quote_positions = [
                pos
                for pos in (first_line.find(u'"', encoding_pos),
                            first_line.find(u"'", encoding_pos))
                if pos != -1
            ]
            if quote_positions:
                open_pos = min(quote_positions)
                close_pos = first_line.find(first_line[open_pos],
                                            open_pos + 1)
                # Keep the previous guess for an unterminated or empty
                # value rather than returning a mangled name.
                if close_pos > open_pos + 1:
                    encoding = first_line[open_pos + 1:close_pos]

    return encoding
73 74
def decoder(buffer):
    """Return *buffer* transcoded to UTF-8.

    The source encoding is whatever autoDetectXMLEncoding() reports for
    the buffer; the buffer is decoded with it and the resulting text is
    encoded again as 'utf8'.
    """
    detected = autoDetectXMLEncoding(buffer)
    text = buffer.decode(detected)
    return text.encode('utf8')
78