|
@@ -1,54 +1,77 @@
|
|
|
-#coding=utf-8
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- encoding: utf-8 -*-
|
|
|
'''
|
|
|
-unicodecsv,bs4,lxml
|
|
|
-Created on 2017年6月26日
|
|
|
-@vsersion:python3.6
|
|
|
-@author: liuyuqi
|
|
|
+@Contact : liuyuqi.gov@msn.cn
|
|
|
+@Time : 2017年6月26日
|
|
|
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
+@Desc :
|
|
|
'''
|
|
|
-from nt import chdir, listdir
|
|
|
|
|
|
+import os
|
|
|
import unicodecsv as csv
|
|
|
from bs4 import BeautifulSoup
|
|
|
+import pandas as pd
|
|
|
class GooExport(object):
    """Collect job records from a directory of XML files into one CSV file.

    Every direct child element of the first ``<job>`` element of a file is
    treated as one record, and that record's own child elements become its
    columns (tag name -> first child node, stored as text). Records from all
    parsed files accumulate in ``self.job`` (a ``pandas.DataFrame``), are
    de-duplicated, and are written out by :meth:`run`.
    """

    def __init__(self, inputDataPath, outResultFile):
        """Remember the input directory and output CSV path.

        Args:
            inputDataPath: directory that is scanned for ``*.xml`` files.
            outResultFile: path of the CSV file written by :meth:`run`.
        """
        self.inputDataPath = inputDataPath
        self.outResultFile = outResultFile
        # Accumulated records; one row per record, one column per field name.
        self.job = pd.DataFrame()

    def parseXml(self, dataFile):
        """Parse one XML file and append its records to ``self.job``.

        Files without a ``<job>`` element are reported and skipped. Text
        nodes between elements and fields without content are ignored, so a
        record may lack some columns (they stay missing in the DataFrame).
        Records without any usable field are dropped.

        Args:
            dataFile: path of the XML file to read (decoded as UTF-8,
                undecodable bytes are replaced).
        """
        with open(dataFile, 'r', encoding='utf8', errors='replace') as f:
            xml_doc = f.read()

        # Strip every newline and space before parsing (behavior kept from
        # the original implementation).
        # NOTE(review): this also removes spaces inside text values and
        # between a tag name and its attributes - confirm the input files
        # never rely on them.
        xml_doc = xml_doc.replace("\n", "").replace(" ", "")
        soup = BeautifulSoup(xml_doc, "lxml-xml")

        job_element = soup.find("job")
        if job_element is None:
            print("skip %s: no <job> element found" % dataFile)
            return

        records = []
        # find_all(True, recursive=False) yields only direct child *tags*,
        # so stray text nodes (e.g. tab indentation) cannot break the loop.
        for record in job_element.find_all(True, recursive=False):
            data = {}
            for field in record.find_all(True, recursive=False):
                if field.contents:
                    # Store plain text so the DataFrame does not keep the
                    # whole parse tree alive.
                    data[field.name] = str(field.contents[0])
            if data:
                records.append(data)

        if records:
            # Build one frame per file instead of concatenating per record.
            new_df = pd.DataFrame(records)
            if self.job.empty:
                self.job = new_df
            else:
                self.job = pd.concat([self.job, new_df], ignore_index=True)
        self.removeDuplication()

    def removeDuplication(self):
        """Drop rows of ``self.job`` that are exact duplicates of an earlier row."""
        self.job = self.job.drop_duplicates()

    def _exportRows(self):
        """Return the header row followed by all data rows as plain lists.

        Missing fields are exported as empty strings rather than ``nan``.
        """
        filled = self.job.astype(object).where(self.job.notna(), "")
        return [list(self.job.columns)] + filled.values.tolist()

    def run(self):
        """Parse every ``*.xml`` file in the input directory and write the CSV.

        The working directory is changed to the input directory (as the
        original did), so a relative ``outResultFile`` is created inside it.
        The output is encoded as GB18030; characters that cannot be encoded
        are dropped.
        """
        print("start!")
        # Resolve the directory once, *before* chdir, so a relative input
        # path is not resolved a second time against the new working dir.
        input_dir = os.path.abspath(self.inputDataPath)
        os.chdir(input_dir)
        # Sorted for a reproducible row order across platforms.
        for name in sorted(os.listdir(input_dir)):
            path = os.path.join(input_dir, name)
            if name.endswith(".xml") and os.path.isfile(path):
                self.parseXml(path)

        rows = self._exportRows()
        with open(self.outResultFile, 'wb') as f:
            writer = csv.writer(f, dialect='excel', encoding='gb18030', errors="ignore")
            writer.writerows(rows)
        print("finish!")
|
|
|
+
|
|
|
if __name__ == '__main__':
    from argparse import ArgumentParser

    # Command-line entry point: read the input directory and the output CSV
    # path from the arguments, then run the export.
    parser = ArgumentParser()
    parser.add_argument("-i", "--inputDataPath", help="inputDataPath", default=".", type=str)
    parser.add_argument("-o", "--outResultFile", help="outResultFile", default="/workspace/result.csv", type=str)

    args = parser.parse_args()
    inputDataPath = args.inputDataPath
    outResultFile = args.outResultFile

    if inputDataPath == "" or outResultFile == "":
        # Raise SystemExit directly instead of calling the site-provided
        # exit(): it is always available and makes the process end with a
        # non-zero status (message goes to stderr) when arguments are invalid.
        raise SystemExit("please input inputDataPath and outResultFile")
    gooExport = GooExport(inputDataPath, outResultFile)
    gooExport.run()
|