文件格式转换UTF-8

2020-04-10

很多时候,需要把文件编码转换成标准的UTF-8,但很多文件往往是ASCII、GBK或GB2312编码,这很容易产生乱码!

如果手动转码,无疑是非常浪费时间的事情。我也是出于工作需要,苦于手工转码太过繁琐,而网上又没有好用的工具,于是用Python写了一个脚本,实现了把文件编码转换为UTF-8的需求。

代码如下:

#!/usr/bin/env python
#-*- coding: utf-8 -*-
#modify the file code format
#
##############################################################################

import os
import shutil
import codecs
import chardet
import sys
# Python 2 only: sys.setdefaultencoding() is removed from the sys namespace
# once the interpreter has started, so the module is reloaded to get it back.
# The call makes implicit str <-> unicode conversions use UTF-8 instead of
# ASCII. Neither the reload() builtin nor setdefaultencoding() exists on
# Python 3, where these two lines have to be dropped.
reload (sys)
sys.setdefaultencoding('utf-8')

def check_file(filePath):
    """Guess the character encoding of a file and print it.

    The whole file is read as raw bytes and handed to ``chardet.detect``.

    Args:
        filePath: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of chardet's result. chardet reports
        ``None`` there when it cannot make a guess, so callers must be
        prepared for that value.
    """
    # ``with`` closes the handle even when read() raises.
    with open(filePath, 'rb') as f:
        data = f.read()
    code_file = chardet.detect(data)['encoding']
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(code_file)
    return code_file

def read_file(filePath, encoding="UTF-8"):
    """Read a whole file and decode it with the given codec.

    Args:
        filePath: Path of the file to read.
        encoding: Name of the codec used to decode the file content.

    Returns:
        The decoded file content as a unicode string.

    Raises:
        UnicodeDecodeError: If the content is not valid in ``encoding``.
    """
    # ``with`` closes the handle even when decoding fails half-way through.
    with codecs.open(filePath, "rb", encoding) as f:
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print('read:   ' + filePath)
        data = f.read()
    return data

def write_file(filePath, uuu, encoding='UTF-8'):
    """Encode a unicode string and write it to a file, replacing its content.

    Args:
        filePath: Path of the file to (over)write.
        uuu: Unicode text to store.
        encoding: Name of the codec used to encode ``uuu``.

    Raises:
        UnicodeEncodeError: If ``uuu`` cannot be represented in ``encoding``.
    """
    # ``with`` flushes and closes the handle even when encoding fails.
    with codecs.open(filePath, 'wb', encoding) as f:
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print('write:   ' + filePath)
        f.write(uuu)

def utf_nobom(dst):
    """Remove a leading UTF-8 byte order mark from a file, in place.

    The file is handled as raw bytes: text mode would translate line endings
    on Windows, and on Python 3 it would yield ``str`` data that cannot be
    compared with the bytes constant ``codecs.BOM_UTF8``.

    Args:
        dst: Path of the file to clean up.
    """
    with open(dst, 'rb') as f:
        data = f.read()
    # Only reopen the file for writing when there is something to strip, so
    # a file without BOM is never truncated and rewritten needlessly.
    if not data.startswith(codecs.BOM_UTF8):
        return
    with open(dst, 'wb') as f:
        f.write(data[len(codecs.BOM_UTF8):])


def search_file(src, dst):
    """Copy a directory tree and list every file of the copy.

    Args:
        src: Directory to copy from.
        dst: Directory to create; ``shutil.copytree`` requires that it does
            not exist yet.

    Returns:
        A list with the path of every file found below ``dst``.

    Raises:
        ValueError: If ``src`` is None.
        OSError: Propagated from ``shutil.copytree``, e.g. when ``dst``
            already exists.
    """
    if src is None:
        raise ValueError('source_path is None')
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print(src)
    print(dst)
    shutil.copytree(src, dst)
    file_list = []
    for dirpath, _dirnames, filenames in os.walk(dst):
        # os.path.join picks the separator of the running platform instead
        # of a hard-coded backslash, which only forms valid paths on Windows.
        file_list.extend(os.path.join(dirpath, name) for name in filenames)
    return file_list



def other_2_utf8(src, dst):
    """Copy ``src`` to ``dst`` and convert every copied file to UTF-8 without BOM.

    For each file of the copy the encoding is detected with ``check_file``,
    the content is decoded with ``read_file``, written back as UTF-8 with
    ``write_file`` and finally stripped of a BOM by ``utf_nobom``. The
    files below ``src`` are never modified.

    Args:
        src: Directory holding the original files.
        dst: Directory to create for the converted copy.
    """
    print(src)
    print(dst)
    for inode in search_file(src, dst):
        code_file = check_file(inode)
        if code_file is None:
            # No encoding could be detected (for instance an empty or a
            # binary file). Passing None on as a codec name would make
            # read_file return undecoded bytes that cannot be re-encoded
            # reliably, so the copy is left untouched.
            print('skip:   ' + inode)
            continue
        data = read_file(inode, code_file)
        write_file(inode, data, 'UTF-8')
        utf_nobom(inode)
    # The loop contains no ``break``, so a ``for ... else`` clause would run
    # unconditionally anyway; a plain statement says the same more clearly.
    print('transform is ok')

注意:

该脚本为Python 2编写。如果想在Python 3下使用,需要做少量调整(例如去掉reload(sys)和sys.setdefaultencoding这两行,并把print语句改为print()函数),改动很简单,请自行修改。