1 import codecs
2
3
4 """Caller will hand this library a buffer and ask it to either convert
5 it or auto-detect the type.
6
7 Based on http://code.activestate.com/recipes/52257/
8
9 Licensed under the PSF License
10 """
11
12
# Byte signatures found at the very start of an XML document, mapped to the
# name of the encoding they indicate (see XML 1.0, Appendix F).
#
# A ``None`` entry stands for a byte that is not compared: when the exact
# four-byte signature is unknown, the lookup code retries with the third and
# fourth bytes replaced by ``None``.
#
# Values are Python codec names wherever a codec exists.  "EBCDIC" has no
# codec of that name (Python ships several EBCDIC code pages), so it is kept
# as a descriptive label only.
autodetect_dict = {
    # Byte-order marks.
    (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
    (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
    (0xFE, 0xFF, None, None): "utf_16_be",
    (0xFF, 0xFE, None, None): "utf_16_le",
    # No byte-order mark: "<" encoded in a 32-bit code unit.
    (0x00, 0x00, 0x00, 0x3C): "utf_32_be",
    (0x3C, 0x00, 0x00, 0x00): "utf_32_le",
    # No byte-order mark: "<?" encoded in 16-bit code units.
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",
    # No byte-order mark: "<?xm" in an ASCII-compatible encoding.
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",
    # No byte-order mark: "<?xm" in EBCDIC.
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",
}
23
24
def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of the XML document in *buffer*.

    buffer -> encoding_name

    The first four bytes are matched against ``autodetect_dict``.  When that
    gives a guess and a codec for it is installed, the buffer is decoded and
    the first line is searched for an ``<?xml ... encoding="..."?>``
    declaration; a non-empty, properly quoted value found there replaces
    the guess.

    Args:
        buffer: The leading bytes of the document.  It should be at least
            4 bytes long; a shorter buffer is not inspected at all.

    Returns:
        The encoding name.  ``"utf_8"`` is returned when nothing more
        specific can be determined.  The name might not have an installed
        decoder (e.g. ``"EBCDIC"``); in that case the byte-signature guess
        is returned without looking at the XML declaration.
    """
    # NOTE(review): the "def" line was missing from the reviewed text.  The
    # name is taken from the ActiveState recipe this module cites and the
    # parameter from the docstring -- confirm against the real file.

    encoding = "utf_8"  # default used whenever no better guess is available

    if len(buffer) < 4:
        return encoding

    head = buffer[:4]
    try:
        # Byte-like input (bytes, bytearray, Python 2 str): integer values.
        signature = tuple(bytearray(head))
    except TypeError:
        # Text input cannot be turned into a bytearray without an encoding;
        # fall back to the code points of its characters.
        signature = tuple(map(ord, head))

    enc_info = autodetect_dict.get(signature)
    if not enc_info:
        # Retry with the two potentially variable bytes wildcarded.
        enc_info = autodetect_dict.get(signature[:2] + (None, None))
    if not enc_info:
        return encoding
    encoding = enc_info

    try:
        decoder = codecs.lookup(encoding)[1]
    except LookupError:
        # No codec installed for this name: the declaration cannot be read,
        # so the byte-signature guess is the best available answer.
        return encoding

    # "replace" keeps a buffer that was cut in the middle of a character (or
    # has undecodable bytes after the declaration) from aborting detection.
    decoded = decoder(buffer, "replace")[0]
    first_line = decoded.split("\n")[0]

    # A byte-order mark decodes to U+FEFF in front of "<?xml", so for such
    # buffers this check fails and the signature-derived encoding is kept.
    if not first_line.startswith(u"<?xml"):
        return encoding

    encoding_pos = first_line.find(u"encoding")
    if encoding_pos == -1:
        return encoding

    # Use whichever quote character comes first after "encoding", so a later
    # attribute quoted with the other character is not mistaken for it.
    candidates = [
        pos
        for pos in (first_line.find('"', encoding_pos),
                    first_line.find("'", encoding_pos))
        if pos != -1
    ]
    if not candidates:
        return encoding

    quote_pos = min(candidates)
    closing_pos = first_line.find(first_line[quote_pos], quote_pos + 1)
    # Accept the value only when it is terminated and non-empty.
    if closing_pos > quote_pos + 1:
        encoding = first_line[quote_pos + 1:closing_pos]

    return encoding
73
74
78