fish 2 months ago
parent
commit
4668d91ff9
5 changed files with 119 additions and 50 deletions
  1. 1 0
      .gitignore
  2. 18 0
      README.md
  3. 71 48
      main.py
  4. 3 2
      requirements.txt
  5. 26 0
      start_gooexport.bat

+ 1 - 0
.gitignore

@@ -58,3 +58,4 @@ docs/_build/
 # PyBuilder
 target/
 
+data/*.xml

+ 18 - 0
README.md

@@ -15,3 +15,21 @@ resultFile=xx
  爬虫数据放到 data 目录中。
 
 4. python main.py
+
+```
+virtualenv .venv
+source .venv/bin/activate
+
+pip install pyinstaller
+
+pip install -r requirements.txt
+
+python main.py -i /workspace/gooExport/data
+
+pyinstaller --onefile main.py
+```
+
+
+## License
+
+## References

+ 71 - 48
main.py

@@ -1,54 +1,77 @@
-#coding=utf-8
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
 '''
-unicodecsv,bs4,lxml
-Created on 2017年6月26日
-@vsersion:python3.6
-@author: liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2017年6月26日
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Parse crawled Goo job XML files and export them to a CSV file.
 '''
-from nt import chdir, listdir
 
+import os
 import unicodecsv as csv
 from bs4 import BeautifulSoup
+import pandas as pd
+class GooExport(object):
+    """ docstring for GooExport """
 
-dataPath="D:\\t"
-resultFile="D:\\result.csv"
-
-  
-def do_list_dir():
-    for lists in listdir(dataPath):
-        saveData(dataPath+"\\"+lists)
-
-def saveData(dataFile):
-    #打开dataFile文件
-    with open(dataFile, 'r',encoding='utf8', errors = 'replace') as f:
-        xml_doc =f.read()   #读取xml文本内容
-    #去除空格和换行
-    xml_doc=xml_doc.replace("\n", "")
-    xml_doc=xml_doc.replace(" ", "")
-    #xml形式读取
-    soup = BeautifulSoup(xml_doc,"xml")
-    
-    #打开resultFile写文件
-    csvfile = open(resultFile, 'ab')# r只读,w可写,a追加 b二进制读写
-    writer = csv.writer(csvfile, dialect='excel', encoding='gb18030',errors="ignore")
-    needData=soup.findChild("box1")
-    for i in range(len(needData.contents)):
-        data=[]
-        flag=True
-        for j in range(len(needData.contents[i].contents)):
-            try:
-                data.append(needData.contents[i].contents[j].contents[0])
-            except Exception as e:
-                print(e)
-                flag=False
-        if flag==True:
-            writer.writerow(data)    
-    csvfile.close()    
-            
-def main():
-    print("start!")
-    chdir(dataPath)
-    do_list_dir()
-    print("finish!")
-
-main()
+    def __init__(self, inputDataPath, outResultFile):
+        self.inputDataPath=inputDataPath
+        self.outResultFile=outResultFile
+        self.job = pd.DataFrame()
+
+    def parseXml(self, dataFile):
+        # print(dataFile,"------------------")
+        with open(dataFile, 'r',encoding='utf8', errors = 'replace') as f:
+            xml_doc =f.read()   #读取xml文本内容
+        # 去除空格和换行
+        xml_doc=xml_doc.replace("\n", "")
+        xml_doc=xml_doc.replace(" ", "")
+        # xml 形式读取
+        soup = BeautifulSoup(xml_doc, "lxml-xml")
+        
+        needData=soup.findChild("job")
+        for i in range(len(needData.contents)):
+            data=dict()
+            for j in range(len(needData.contents[i].contents)):
+                try:
+                    key=needData.contents[i].contents[j].name
+                    value=needData.contents[i].contents[j].contents[0]
+                    data[key]=value
+                except Exception as e:
+                    print(e)
+            # 把data加入到job(dataframe)中,其中key是列名,value是数据,可能会缺失字段
+            new_df = pd.DataFrame([data])  # Convert the data to a DataFrame
+            self.job = pd.concat([self.job, new_df], ignore_index=True)
+        self.removeDuplication()
+
+    def removeDuplication(self):
+        # print("removeDuplication")
+        self.job=self.job.drop_duplicates()
+    def run(self):
+        print("start!")
+        # no os.chdir here: chdir-ing into inputDataPath breaks the relative joins below
+        for lists in os.listdir(self.inputDataPath):
+            if os.path.isfile(os.path.join(self.inputDataPath, lists)) and lists.endswith(".xml"):
+                self.parseXml(os.path.join(self.inputDataPath, lists))
+        with open(self.outResultFile, 'wb') as f:
+            writer = csv.writer(f, dialect='excel', encoding='gb18030',errors="ignore")
+            writer.writerow(self.job.columns)
+            writer.writerows(self.job.values)
+        print("finish!")
+
+if __name__=='__main__':
+    from argparse import ArgumentParser
+
+    parser = ArgumentParser()
+    parser.add_argument("-i", "--inputDataPath", help="inputDataPath", default=".", type=str)
+    parser.add_argument("-o", "--outResultFile", help="outResultFile", default="/workspace/result.csv", type=str)
+
+    args = parser.parse_args()
+    inputDataPath=args.inputDataPath
+    outResultFile=args.outResultFile
+
+    if inputDataPath=="" or outResultFile=="":
+        print("please input inputDataPath and outResultFile")
+        exit()
+    gooExport = GooExport(inputDataPath, outResultFile)
+    gooExport.run()

+ 3 - 2
requirements.txt

@@ -1,3 +1,4 @@
-beautifulsoup4==4.5.3
+beautifulsoup4==4.12.0
 unicodecsv==0.14.1
-lxml
+lxml
+pandas

+ 26 - 0
start_gooexport.bat

@@ -0,0 +1,26 @@
+@echo off
+REM ***************************************************************************
+REM @Contact :   liuyuqi.gov@msn.cn
+REM @Time    :   2024/12/19 02:55:35
+REM @Version :   1.0
+REM @License :   (C)Copyright 2019 liuyuqi.
+REM @Desc    :   提示用户输入i 和 o ,执行命令 python main.py -i %1 -o %2
+REM @File    :   start_gooexport.bat
+REM @Author  :   liuyuqi
+REM @History :
+REM %1 - inputDataPath (forwarded to main.py -i)
+REM %2 - outResultFile (forwarded to main.py -o)
+REM %3 - (unused)
+REM ***************************************************************************
+
+set /p inputPath=Enter input data path (-i): 
+set /p outResult=Enter output result file (-o): 
+
+python main.py -i %inputPath% -o %outResult%
+
+pause
+
+exit
+
+
+