@@ -1,54 +1,77 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
-Created on 2017年6月26日
-@author: liuyuqi
+@Contact : liuyuqi.gov@msn.cn
+@Time : 2017年6月26日
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc : Parse job records from the XML files in a directory and export them to one CSV file.
-from nt import chdir, listdir
+import os
import unicodecsv as csv
from bs4 import BeautifulSoup
+import pandas as pd
+class GooExport(object):
+    """Collect records from XML files into a DataFrame and write them out as CSV."""
-def do_list_dir():
- for lists in listdir(dataPath):
- saveData(dataPath+"\\"+lists)
-def saveData(dataFile):
- #打开dataFile文件
- with open(dataFile, 'r',encoding='utf8', errors = 'replace') as f:
- xml_doc =f.read() #读取xml文本内容
- #去除空格和换行
- xml_doc=xml_doc.replace("\n", "")
- xml_doc=xml_doc.replace(" ", "")
- #xml形式读取
- soup = BeautifulSoup(xml_doc,"xml")
- #打开resultFile写文件
- csvfile = open(resultFile, 'ab')# r只读,w可写,a追加 b二进制读写
- writer = csv.writer(csvfile, dialect='excel', encoding='gb18030',errors="ignore")
- needData=soup.findChild("box1")
- for i in range(len(needData.contents)):
- data=[]
- flag=True
- for j in range(len(needData.contents[i].contents)):
- try:
- data.append(needData.contents[i].contents[j].contents[0])
- except Exception as e:
- print(e)
- flag=False
- if flag==True:
- writer.writerow(data)
- csvfile.close()
-def main():
- print("start!")
- chdir(dataPath)
- do_list_dir()
- print("finish!")
+    def __init__(self, inputDataPath, outResultFile):
+        """Remember the input/output locations and start with no records.
+
+        :param inputDataPath: directory that ``run`` scans for ``.xml`` files
+        :param outResultFile: path of the CSV file that ``run`` writes
+        """
+        # Accumulator for every parsed record; starts out as an empty frame.
+        self.job = pd.DataFrame()
+        self.inputDataPath, self.outResultFile = inputDataPath, outResultFile
+    def parseXml(self, dataFile):
+        """Parse one XML file and append its records to ``self.job``.
+
+        The first ``job`` element of the document is located; each of its
+        child elements becomes one row, and each child element of a row
+        becomes one column (tag name -> first child node of that tag).
+        Fields without content are left out of the row, so columns may be
+        missing for individual records. Duplicate rows are dropped at the end.
+
+        A file without a ``job`` element is reported and skipped instead of
+        aborting the whole export.
+
+        :param dataFile: path of the XML file to read (decoded as UTF-8,
+            undecodable bytes are replaced)
+        """
+        with open(dataFile, 'r', encoding='utf8', errors='replace') as f:
+            xml_doc = f.read()
+        # Remove every newline and space so no whitespace-only text nodes
+        # end up between the elements.
+        # NOTE(review): this also removes spaces inside field values and
+        # between a tag name and its attributes - confirm the input never
+        # relies on either before changing it.
+        xml_doc = xml_doc.replace("\n", "").replace(" ", "")
+        soup = BeautifulSoup(xml_doc, "lxml-xml")
+        job_node = soup.find("job")
+        if job_node is None:
+            print("no <job> element found in {}".format(dataFile))
+            return
+        rows = []
+        # find_all(True, recursive=False) yields only direct child *tags*,
+        # so stray text nodes can no longer raise AttributeError.
+        for record in job_node.find_all(True, recursive=False):
+            row = {}
+            for field in record.find_all(True, recursive=False):
+                if field.contents:  # skip empty fields such as <x/>
+                    row[field.name] = field.contents[0]
+            if row:
+                rows.append(row)
+        if rows:
+            # Build the frame once per file instead of concatenating per
+            # record; missing fields become NaN in the combined frame.
+            self.job = pd.concat([self.job, pd.DataFrame(rows)], ignore_index=True)
+        self.removeDuplication()
+    def removeDuplication(self):
+        """Drop rows of ``self.job`` that are exact duplicates of an earlier row."""
+        deduplicated = self.job.drop_duplicates()
+        self.job = deduplicated
+    def run(self):
+        """Parse every ``.xml`` file in the input directory and write the CSV.
+
+        The working directory is changed to the input directory, each regular
+        file whose name ends in ``.xml`` is passed to ``parseXml``, and the
+        accumulated ``self.job`` frame is written to ``self.outResultFile``
+        (header row first, then one row per record) encoded as gb18030.
+        """
+        print("start!")
+        # Resolve the directory *before* chdir: a relative path would
+        # otherwise be looked up again inside the new working directory
+        # and fail (or list the wrong directory).
+        input_dir = os.path.abspath(self.inputDataPath)
+        os.chdir(input_dir)
+        # Sorted so the row order (and which duplicate is kept) does not
+        # depend on the arbitrary order returned by os.listdir.
+        for name in sorted(os.listdir(input_dir)):
+            path = os.path.join(input_dir, name)
+            if name.endswith(".xml") and os.path.isfile(path):
+                self.parseXml(path)
+        # unicodecsv writes bytes, hence binary mode; characters that cannot
+        # be encoded are dropped (errors="ignore").
+        with open(self.outResultFile, 'wb') as f:
+            writer = csv.writer(f, dialect='excel', encoding='gb18030', errors="ignore")
+            writer.writerow(self.job.columns)
+            writer.writerows(self.job.values)
+        print("finish!")
+if __name__ == '__main__':
+    # Command-line entry point: read the input directory and output file
+    # from the arguments and run the export.
+    from argparse import ArgumentParser
+    parser = ArgumentParser()
+    parser.add_argument("-i", "--inputDataPath", help="inputDataPath", default=".", type=str)
+    parser.add_argument("-o", "--outResultFile", help="outResultFile", default="/workspace/result.csv", type=str)
+    args = parser.parse_args()
+    inputDataPath = args.inputDataPath
+    outResultFile = args.outResultFile
+    if not inputDataPath or not outResultFile:
+        # parser.error prints usage plus the message to stderr and exits
+        # with status 2; the bare exit() builtin is only injected by the
+        # site module and reported this failure with status 0.
+        parser.error("please input inputDataPath and outResultFile")
+    gooExport = GooExport(inputDataPath, outResultFile)
+    gooExport.run()