I found that even modern Python versions (like 3.x) do not detect a BOM in text files. I would like to know if there is any module that could add this missing feature to Python by replacing the open() and codecs.open() functions for reading and writing text files.
views: 63
answers: 2
A:
The solution suggested here still seems good to me (here's a modified version of that code, still in Python 2, not Python 3, and with a usage example):
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs, logging, sys
# Emit INFO-level records so the logging.info() messages issued by
# read_unicode() (which encoding was chosen) are actually shown.
logging.basicConfig(level=logging.INFO)
# Byte-order marks mapped to the codec used to decode the bytes that follow
# the mark.  The UTF-32 marks are included because the UTF-32LE mark starts
# with the same two bytes as the UTF-16LE mark; read_unicode() therefore
# tries the longest marks first.
bomdict = {
    codecs.BOM_UTF8: 'UTF8',
    codecs.BOM_UTF32_BE: 'UTF-32BE',
    codecs.BOM_UTF32_LE: 'UTF-32LE',
    codecs.BOM_UTF16_BE: 'UTF-16BE',
    codecs.BOM_UTF16_LE: 'UTF-16LE',
}


def read_unicode(filename):
    """Read *filename* and return its whole content as decoded text.

    The encoding is chosen from the byte-order mark at the start of the
    file (see ``bomdict``); the mark itself is removed from the result.
    When no known mark is present the content is decoded as UTF-8.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text (``unicode`` on Python 2, ``str`` on Python 3).

    Raises:
        IOError/OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    # Binary mode: the BOM comparison needs the raw bytes, and text mode
    # would either translate newlines or (Python 3) decode before we get
    # the chance to look at the mark.
    with open(filename, 'rb') as source:
        the_text = source.read()

    # Longest marks first, so b'\xff\xfe\x00\x00' is recognised as UTF-32LE
    # instead of matching the shorter UTF-16LE prefix.  A UTF-16LE file
    # whose first character is U+0000 is indistinguishable from that mark.
    candidates = sorted(
        bomdict.items(), key=lambda item: len(item[0]), reverse=True)
    for bom, encoding in candidates:
        if the_text.startswith(bom):
            logging.info('BOM found, using %s', encoding)
            the_text = the_text[len(bom):]
            break
    else:
        logging.info('No BOM, using utf8')
        encoding = 'UTF8'
    return the_text.decode(encoding)
# Usage example: write a small UTF-16LE file that starts with a BOM, then
# read it back through read_unicode().  The ``with`` block closes the file
# even if a write fails; print() with a single argument behaves the same
# on Python 2 and Python 3.
with open('x.txt', 'wb') as f:
    f.write(codecs.BOM_UTF16_LE)
    f.write(u'zeé fóo!'.encode('UTF-16LE'))
print(read_unicode('x.txt'))
Alex Martelli
2010-06-14 18:56:48
+1
A:
Here is a partially working replacement for file.open(). It works with Python 2.6, but on Python 3.1 I get an error:
Traceback (most recent call last):
File "unicode-file.py", line 15, in <module>
old_file_write = file.write
NameError: name 'file' is not defined
Unicode-friendly file.open() replacement
#!/usr/bin/python
import codecs, sys, types
# Keep a reference to the built-in open() before this module defines its own
# open() further down, so the replacement can still delegate to the original.
open_old = open
# On Python 3.x the write method is overridden so that it accepts bytes in
# addition to str.  The original text-mode write is captured first.
try:
    old_file_write = file.write  # Python 2: the built-in file type
except NameError:
    # Python 3 has no built-in ``file``; the text streams returned by the
    # built-in open() are io.TextIOWrapper objects there.
    import io
    old_file_write = io.TextIOWrapper.write


class file():
    """File-like object whose write() accepts bytes as well as text."""

    def write(self, d):
        """Write *d* to the stream.

        Bytes are written to ``self.buffer``; anything else is passed to
        the original write method captured in ``old_file_write``.
        """
        # On Python 2 ``bytes`` is ``str``, so every byte string takes this
        # branch; per the comment above, the override targets Python 3.
        if isinstance(d, bytes):
            self.buffer.write(d)
        else:
            # NOTE(review): this class declares no base class, so the
            # delegated call only succeeds when ``self`` is also an instance
            # of the type ``old_file_write`` belongs to -- TODO confirm the
            # intended base class.
            old_file_write(self, d)
# BOM signatures and the codec each one selects.  Order matters: the
# UTF-32 LE mark begins with the two bytes of the UTF-16 LE mark, so the
# 4-byte marks are tested before the 2-byte ones.
_BOM_CODECS = (
    (codecs.BOM_UTF8, "utf_8"),
    (codecs.BOM_UTF32_LE, "utf_32_le"),
    (codecs.BOM_UTF32_BE, "utf_32_be"),
    (codecs.BOM_UTF16_LE, "utf_16_le"),
    (codecs.BOM_UTF16_BE, "utf_16_be"),
)


def _sniff_bom(head):
    """Return ``(codec_name, bom_length)`` for the leading bytes *head*."""
    for bom, codec_name in _BOM_CODECS:
        if head.startswith(bom):
            return codec_name, len(bom)
    # No BOM: assume UTF-8 and skip nothing.
    return "utf-8", 0


def open(filename, mode=None, bufsize=None):
    """Open *filename*, detecting the encoding from its BOM when reading text.

    Args:
        filename: Path of the file to open.
        mode: Open mode.  ``"r"`` (or ``None``, which is treated as
            ``"r"``) selects BOM detection; every other mode is handed to
            the original ``open`` unchanged.
        bufsize: Buffering argument forwarded to the original ``open`` for
            modes other than ``"r"``; ``None`` means "use its default".
            It is not used for the BOM-detecting read path.

    Returns:
        For ``"r"``: a ``codecs`` stream decoding with the detected
        encoding (UTF-8 when there is no BOM), positioned just past the
        BOM.  Otherwise whatever the original ``open`` returns.
    """
    if mode is None:
        mode = "r"

    # BOM detection only matters when reading text.
    if mode != "r":
        if bufsize is None:
            # The built-in open() rejects None as a buffering value.
            return open_old(filename, mode)
        return open_old(filename, mode, bufsize)

    # Read the first 4 bytes (the longest BOM) from a short-lived binary
    # handle that is always closed, whichever encoding is detected.
    probe = open_old(filename, "rb")
    try:
        head = probe.read(4)
    finally:
        probe.close()

    codec_name, bom_length = _sniff_bom(head)
    f = codecs.open(filename, mode, codec_name)
    # Skip the mark so it is not returned as a U+FEFF character.
    f.seek(bom_length, 0)
    return f
# now call open(filename, "r") to read text with BOM detection
Sorin Sbarnea
2010-06-15 13:49:47