1 month ago · e211c241cc
--- a/README.md
+++ b/README.md
@@ -1,3 +1,207 @@
 
				 # hadoop-tools
			
 
				 
			
 
				-hadoop相关操作类
			
 
				+Hadoop 相关操作类，提供 Hadoop 数据分析能力。
			
 
				+
			
 
				+## 项目概述
			
 
				+
			
 
				+本项目提供两种语言的实现：
			
 
				+- **Java 版本**：原始实现，位于 `src/` 目录
			
 
				+- **Python 版本**：新实现，位于 `python/` 目录（推荐使用）
			
 
				+
			
 
				+## Python 版本功能
			
 
				+
			
 
				+### 1. HDFS 文件系统操作 (`hdfs_operations.py`)
			
 
				+
			
 
				+提供与 HDFS 交互的各种方法：
			
 
				+
			
 
				+| 方法 | 功能描述 |
			
 
				+|------|----------|
			
 
				+| `make_dir(path)` | 创建目录 |
			
 
				+| `delete(path, recursive)` | 删除文件/目录 |
			
 
				+| `copy_from_local(src, dst)` | 从本地上传文件到 HDFS |
			
 
				+| `copy_to_local(src, dst)` | 从 HDFS 下载文件到本地 |
			
 
				+| `read_file(path)` | 读取 HDFS 文件内容 |
			
 
				+| `write_file(path, content)` | 写入内容到 HDFS 文件 |
			
 
				+| `exists(path)` | 检查路径是否存在 |
			
 
				+| `list_dir(path)` | 列出目录内容 |
			
 
				+| `get_file_size(path)` | 获取文件大小 |
			
 
				+
			
 
				+**使用示例：**
			
 
				+```python
			
 
				+from python.hdfs_operations import HDFSOperations
			
 
				+
			
 
				+hdfs = HDFSOperations()
			
 
				+
			
 
				+# 创建目录
			
 
				+hdfs.make_dir('/user/hadoop/data')
			
 
				+
			
 
				+# 上传文件
			
 
				+hdfs.copy_from_local('/local/path/file.txt', '/user/hadoop/data/')
			
 
				+
			
 
				+# 读取文件内容
			
 
				+content = hdfs.read_file('/user/hadoop/data/file.txt')
			
 
				+
			
 
				+# 检查文件是否存在
			
 
				+if hdfs.exists('/user/hadoop/data/file.txt'):
			
 
				+    print("文件存在")
			
 
				+```
			
 
				+
			
 
				+### 2. Hadoop Streaming 词频统计 (`mapreduce/wordcount_streaming.py`)
			
 
				+
			
 
				+使用 Hadoop Streaming 方式实现经典的 WordCount 算法：
			
 
				+
			
 
				+**核心组件：**
			
 
				+- **Mapper**: 将文本分割为单词，输出 `<单词, 1>`
			
 
				+- **Combiner**: 在 Mapper 端进行本地合并，减少数据传输
			
 
				+- **Reducer**: 统计每个单词的总次数
			
 
				+
			
 
				+**使用方式：**
			
 
				+
			
 
				+1. **作为模块导入：**
			
 
				+```python
			
 
				+from python.mapreduce.wordcount_streaming import WordCountStreaming
			
 
				+
			
 
				+wc = WordCountStreaming()
			
 
				+
			
 
				+# 本地统计（用于测试）
			
 
				+result = wc.count_words_locally("Hello world, hello Hadoop!")
			
 
				+print(result)  # {'hello': 2, 'world': 1, 'hadoop': 1}
			
 
				+
			
 
				+# 提交 Hadoop Streaming 作业
			
 
				+wc.run('/user/hadoop/input', '/user/hadoop/output')
			
 
				+```
			
 
				+
			
 
				+2. **作为独立脚本运行：**
			
 
				+```bash
			
 
				+# 运行 Mapper
			
 
				+echo "Hello world hello" | python wordcount_streaming.py mapper
			
 
				+
			
 
				+# 运行 Reducer
			
 
				+printf 'hello\t1\nworld\t1\nhello\t1\n' | sort | python wordcount_streaming.py reducer
			
 
				+
			
 
				+# 本地测试
			
 
				+python wordcount_streaming.py local
			
 
				+```
			
 
				+
			
 
				+### 3. PySpark 词频统计 (`mapreduce/wordcount_spark.py`)
			
 
				+
			
 
				+使用 PySpark 实现词频统计，这是现代大数据处理的推荐方式：
			
 
				+
			
 
				+**特性：**
			
 
				+- 更简洁的 API
			
 
				+- 更好的性能
			
 
				+- 支持 RDD 和 DataFrame 两种 API
			
 
				+- 可以与 Spark SQL、MLlib 等集成
			
 
				+
			
 
				+**使用方式：**
			
 
				+
			
 
				+1. **作为模块导入：**
			
 
				+```python
			
 
				+from python.mapreduce.wordcount_spark import WordCountSpark
			
 
				+
			
 
				+wc = WordCountSpark()
			
 
				+
			
 
				+# 本地统计（不使用 Spark 集群）
			
 
				+result = wc.count_words_locally("Spark is fast and general engine")
			
 
				+
			
 
				+# 使用 Spark 运行（支持本地文件和 HDFS）
			
 
				+result = wc.run('/user/hadoop/data/input.txt', '/user/hadoop/output')
			
 
				+
			
 
				+# 停止 Spark 会话
			
 
				+wc.stop()
			
 
				+```
			
 
				+
			
 
				+2. **作为独立脚本运行：**
			
 
				+```bash
			
 
				+# 本地模式（不使用 Spark 集群）
			
 
				+python wordcount_spark.py --local input.txt output.txt
			
 
				+
			
 
				+# Spark 模式
			
 
				+python wordcount_spark.py hdfs:///user/hadoop/data output
			
 
				+```
			
 
				+
			
 
				+## 工具函数 (`utils/helpers.py`)
			
 
				+
			
 
				+提供常用的辅助功能：
			
 
				+
			
 
				+| 函数 | 功能描述 |
			
 
				+|------|----------|
			
 
				+| `run_command(cmd)` | 执行命令行命令 |
			
 
				+| `validate_hdfs_path(path)` | 验证 HDFS 路径格式 |
			
 
				+| `format_file_size(size_bytes)` | 格式化文件大小（B/KB/MB/GB/TB） |
			
 
				+| `setup_logger(name)` | 设置日志器 |
			
 
				+
			
 
				+## 安装依赖
			
 
				+
			
 
				+```bash
			
 
				+# 安装基础依赖
			
 
				+pip install -r requirements.txt
			
 
				+
			
 
				+# 安装 PySpark（如果需要使用 Spark 功能）
			
 
				+pip install pyspark
			
 
				+```
			
 
				+
			
 
				+## 运行示例
			
 
				+
			
 
				+```bash
			
 
				+# 运行完整示例
			
 
				+python examples/run_examples.py
			
 
				+
			
 
				+# 测试 Hadoop Streaming 词频统计
			
 
				+python python/mapreduce/wordcount_streaming.py local
			
 
				+
			
 
				+# 测试 PySpark 词频统计（本地模式）
			
 
				+python python/mapreduce/wordcount_spark.py --local examples/sample_data.txt
			
 
				+```
			
 
				+
			
 
				+## 项目结构
			
 
				+
			
 
				+```
			
 
				+hadoop-tools/
			
 
				+├── python/                    # Python 版本实现
			
 
				+│   ├── __init__.py           # 模块入口
			
 
				+│   ├── hdfs_operations.py    # HDFS 操作功能
			
 
				+│   ├── mapreduce/
			
 
				+│   │   ├── __init__.py
			
 
				+│   │   ├── wordcount_streaming.py  # Hadoop Streaming 方式
			
 
				+│   │   └── wordcount_spark.py      # PySpark 方式
			
 
				+│   └── utils/
			
 
				+│       ├── __init__.py
			
 
				+│       └── helpers.py         # 工具函数
			
 
				+├── examples/                  # 示例文件
			
 
				+│   ├── sample_data.txt       # 示例数据
			
 
				+│   └── run_examples.py       # 示例运行脚本
			
 
				+├── src/                       # Java 版本实现
			
 
				+│   └── me/yoqi/hadoop/test/
			
 
				+│       ├── CommonOperation.java  # HDFS 操作
			
 
				+│       └── WordCount.java        # 词频统计
			
 
				+├── R/                         # R 语言相关（待实现）
			
 
				+├── pom.xml                   # Maven 配置（Java 项目）
			
 
				+├── requirements.txt          # Python 依赖
			
 
				+└── README.md
			
 
				+```
			
 
				+
			
 
				+## 与 Java 版本的对应关系
			
 
				+
			
 
				+| Java 类/方法 | Python 对应 |
			
 
				+|--------------|-------------|
			
 
				+| `CommonOperation.makeDir()` | `HDFSOperations.make_dir()` |
			
 
				+| `CommonOperation.delDir()` | `HDFSOperations.delete(recursive=True)` |
			
 
				+| `CommonOperation.delFile()` | `HDFSOperations.delete(recursive=False)` |
			
 
				+| `CommonOperation.putFile()` | `HDFSOperations.copy_from_local()` |
			
 
				+| `CommonOperation.readFile()` | `HDFSOperations.read_file()` |
			
 
				+| `CommonOperation.writeFile()` | `HDFSOperations.write_file()` |
			
 
				+| `WordCount.TokenizerMapper` | `WordCountStreaming.mapper()` |
			
 
				+| `WordCount.IntSumReducer` | `WordCountStreaming.reducer()` |
			
 
				+| `WordCount.main()` | `WordCountStreaming.run()` / `WordCountSpark.run()` |
			
 
				+
			
 
				+## 注意事项
			
 
				+
			
 
				+1. **HDFS 操作**：需要配置 Hadoop 环境变量，确保 `hdfs` 或 `hadoop` 命令可用
			
 
				+2. **Hadoop Streaming**：需要找到 Hadoop Streaming jar 文件，脚本会自动查找常见位置
			
 
				+3. **PySpark**：需要安装 PySpark 并配置 Spark 环境
			
 
				+4. **路径格式**：HDFS 路径必须以 `/` 开头，不能包含连续斜杠或非法字符
			
 
				+
			
 
				+## 许可证
			
 
				+
			
 
				+详见 LICENSE 文件。
			
--- a/examples/run_examples.py
+++ b/examples/run_examples.py
@@ -0,0 +1,216 @@
 
				+"""
			
 
				+Hadoop Tools 示例脚本
			
 
				+
			
 
				+演示如何使用 Python 版本的 Hadoop 工具包。
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+import os
			
 
				+
			
 
				+# 添加项目路径
			
 
				+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
			
 
				+
			
 
				+from python.hdfs_operations import HDFSOperations
			
 
				+from python.mapreduce.wordcount_streaming import WordCountStreaming
			
 
				+from python.mapreduce.wordcount_spark import WordCountSpark
			
 
				+from python.utils.helpers import format_file_size
			
 
				+
			
 
				+
			
 
				+def example_hdfs_operations():
			
 
				+    """
			
 
				+    示例：HDFS 操作
			
 
				+    """
			
 
				+    print("\n" + "=" * 60)
			
 
				+    print("示例 1: HDFS 操作 (HDFSOperations)")
			
 
				+    print("=" * 60)
			
 
				+    
			
 
				+    hdfs = HDFSOperations()
			
 
				+    
			
 
				+    # 由于我们可能没有实际的 Hadoop 环境，这里只演示 API 的使用
			
 
				+    # 实际使用时需要配置 Hadoop 环境
			
 
				+    
			
 
				+    print("\n1. 验证 HDFS 路径格式:")
			
 
				+    test_paths = [
			
 
				+        '/user/hadoop/data',
			
 
				+        'invalid/path',
			
 
				+        '/user//hadoop',
			
 
				+        '/user/hadoop/data/file.txt'
			
 
				+    ]
			
 
				+    
			
 
				+    for path in test_paths:
			
 
				+        is_valid = hdfs._validate_hdfs_path if hasattr(hdfs, '_validate_hdfs_path') else hdfs.exists  # 实际使用 exists 方法
			
 
				+        # 这里使用本地验证方法
			
 
				+        from python.utils.helpers import validate_hdfs_path
			
 
				+        valid = validate_hdfs_path(path)
			
 
				+        print(f"   路径 '{path}': {'有效' if valid else '无效'}")
			
 
				+    
			
 
				+    print("\n2. 格式化文件大小:")
			
 
				+    sizes = [1024, 1024*1024, 1024*1024*1024, 1024*1024*1024*1024]
			
 
				+    for size in sizes:
			
 
				+        print(f"   {size} 字节 = {format_file_size(size)}")
			
 
				+    
			
 
				+    print("\n3. HDFS 操作方法列表:")
			
 
				+    methods = [
			
 
				+        ('make_dir', '创建目录'),
			
 
				+        ('delete', '删除文件/目录'),
			
 
				+        ('copy_from_local', '从本地上传文件到 HDFS'),
			
 
				+        ('copy_to_local', '从 HDFS 下载文件到本地'),
			
 
				+        ('read_file', '读取 HDFS 文件内容'),
			
 
				+        ('write_file', '写入内容到 HDFS 文件'),
			
 
				+        ('exists', '检查路径是否存在'),
			
 
				+        ('list_dir', '列出目录内容'),
			
 
				+        ('get_file_size', '获取文件大小')
			
 
				+    ]
			
 
				+    
			
 
				+    for method, desc in methods:
			
 
				+        print(f"   - {method}(): {desc}")
			
 
				+    
			
 
				+    print("\n注意：实际运行 HDFS 操作需要配置 Hadoop 环境变量。")
			
 
				+
			
 
				+
			
 
				+def example_wordcount_streaming():
			
 
				+    """
			
 
				+    示例：Hadoop Streaming 词频统计
			
 
				+    """
			
 
				+    print("\n" + "=" * 60)
			
 
				+    print("示例 2: Hadoop Streaming 词频统计 (WordCountStreaming)")
			
 
				+    print("=" * 60)
			
 
				+    
			
 
				+    wc = WordCountStreaming()
			
 
				+    
			
 
				+    print("\n1. Mapper 功能演示:")
			
 
				+    test_line = "Hello world, hello Hadoop! Hadoop is great."
			
 
				+    print(f"   输入行: '{test_line}'")
			
 
				+    mapper_output = wc.mapper(test_line)
			
 
				+    print(f"   Mapper 输出: {mapper_output}")
			
 
				+    
			
 
				+    print("\n2. Combiner 功能演示:")
			
 
				+    print(f"   输入: {mapper_output}")
			
 
				+    combiner_output = wc.combiner(mapper_output)
			
 
				+    print(f"   Combiner 输出: {combiner_output}")
			
 
				+    
			
 
				+    print("\n3. Reducer 功能演示:")
			
 
				+    test_word = 'hadoop'
			
 
				+    test_counts = [1, 1, 1, 1]
			
 
				+    print(f"   单词: '{test_word}', 计数列表: {test_counts}")
			
 
				+    reducer_output = wc.reducer(test_word, test_counts)
			
 
				+    print(f"   Reducer 输出: {reducer_output}")
			
 
				+    
			
 
				+    print("\n4. 本地词频统计演示:")
			
 
				+    sample_text = """
			
 
				+    Hadoop is a framework for distributed storage and processing of big data.
			
 
				+    Big data is data that contains greater variety.
			
 
				+    Hadoop provides massive storage for any kind of data.
			
 
				+    """
			
 
				+    print(f"   输入文本:")
			
 
				+    for line in sample_text.strip().split('\n'):
			
 
				+        print(f"      {line.strip()}")
			
 
				+    
			
 
				+    result = wc.count_words_locally(sample_text)
			
 
				+    
			
 
				+    print("\n   统计结果 (按词频排序):")
			
 
				+    sorted_result = sorted(result.items(), key=lambda x: x[1], reverse=True)
			
 
				+    for word, count in sorted_result[:10]:
			
 
				+        print(f"      {word}: {count}")
			
 
				+
			
 
				+
			
 
				+def example_wordcount_spark():
			
 
				+    """
			
 
				+    示例：PySpark 词频统计
			
 
				+    """
			
 
				+    print("\n" + "=" * 60)
			
 
				+    print("示例 3: PySpark 词频统计 (WordCountSpark)")
			
 
				+    print("=" * 60)
			
 
				+    
			
 
				+    wc = WordCountSpark()
			
 
				+    
			
 
				+    print("\n1. 本地词频统计演示 (不使用 Spark 集群):")
			
 
				+    
			
 
				+    # 使用示例数据文件
			
 
				+    sample_file = os.path.join(os.path.dirname(__file__), 'sample_data.txt')
			
 
				+    
			
 
				+    if os.path.exists(sample_file):
			
 
				+        print(f"   输入文件: {sample_file}")
			
 
				+        
			
 
				+        # 读取文件内容
			
 
				+        with open(sample_file, 'r', encoding='utf-8') as f:
			
 
				+            content = f.read()
			
 
				+        
			
 
				+        # 本地统计
			
 
				+        result = wc.count_words_locally(content)
			
 
				+        
			
 
				+        print("\n   统计结果 (Top 15):")
			
 
				+        sorted_result = sorted(result.items(), key=lambda x: x[1], reverse=True)
			
 
				+        
			
 
				+        total_words = sum(result.values())
			
 
				+        print(f"   总词数: {total_words}")
			
 
				+        print(f"   不同词数: {len(result)}")
			
 
				+        print("   -" * 30)
			
 
				+        
			
 
				+        for i, (word, count) in enumerate(sorted_result[:15], 1):
			
 
				+            percentage = (count / total_words) * 100
			
 
				+            print(f"   {i:2d}. {word:12s} {count:3d} ({percentage:4.1f}%)")
			
 
				+    else:
			
 
				+        print(f"   示例文件不存在: {sample_file}")
			
 
				+        # 使用内置文本
			
 
				+        sample_text = """
			
 
				+        Spark is a fast and general engine for large-scale data processing.
			
 
				+        Spark provides high-level APIs in Java, Scala, Python and R.
			
 
				+        Spark also supports a rich set of higher-level tools including Spark SQL.
			
 
				+        """
			
 
				+        result = wc.count_words_locally(sample_text)
			
 
				+        
			
 
				+        print("\n   统计结果:")
			
 
				+        for word, count in sorted(result.items(), key=lambda x: x[1], reverse=True):
			
 
				+            print(f"   {word}: {count}")
			
 
				+    
			
 
				+    print("\n2. WordCountSpark 方法列表:")
			
 
				+    methods = [
			
 
				+        ('run', '运行完整的词频统计作业（支持本地文件和 HDFS）'),
			
 
				+        ('count_words_from_rdd', '使用 RDD API 统计单词'),
			
 
				+        ('count_words_from_dataframe', '使用 DataFrame API 统计单词'),
			
 
				+        ('count_words_locally', '本地统计单词（不使用 Spark 集群）'),
			
 
				+        ('run_with_files', '对多个文件运行词频统计'),
			
 
				+        ('stop', '停止 Spark 会话')
			
 
				+    ]
			
 
				+    
			
 
				+    for method, desc in methods:
			
 
				+        print(f"   - {method}(): {desc}")
			
 
				+    
			
 
				+    print("\n注意：使用 Spark 功能需要安装 PySpark: pip install pyspark")
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """
			
 
				+    主函数：运行所有示例
			
 
				+    """
			
 
				+    print("\n" + "#" * 60)
			
 
				+    print("# Hadoop Tools - Python 版本示例")
			
 
				+    print("# 提供 Hadoop 数据分析能力")
			
 
				+    print("#" * 60)
			
 
				+    
			
 
				+    print("\n项目结构:")
			
 
				+    print("  python/")
			
 
				+    print("    ├── __init__.py          # 模块入口")
			
 
				+    print("    ├── hdfs_operations.py   # HDFS 操作功能")
			
 
				+    print("    ├── mapreduce/")
			
 
				+    print("    │   ├── __init__.py")
			
 
				+    print("    │   ├── wordcount_streaming.py  # Hadoop Streaming 方式")
			
 
				+    print("    │   └── wordcount_spark.py      # PySpark 方式")
			
 
				+    print("    └── utils/")
			
 
				+    print("        ├── __init__.py")
			
 
				+    print("        └── helpers.py      # 工具函数")
			
 
				+    
			
 
				+    # 运行示例
			
 
				+    example_hdfs_operations()
			
 
				+    example_wordcount_streaming()
			
 
				+    example_wordcount_spark()
			
 
				+    
			
 
				+    print("\n" + "#" * 60)
			
 
				+    print("# 示例运行完成！")
			
 
				+    print("# 请查看上方输出了解各模块的使用方法。")
			
 
				+    print("#" * 60)
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    main()
			
--- a/examples/sample_data.txt
+++ b/examples/sample_data.txt
@@ -0,0 +1,10 @@
 
				+Hello world, hello Hadoop!
			
 
				+Hadoop is a framework for distributed storage and processing of big data.
			
 
				+Big data is data that contains greater variety, arriving in increasing volumes and with more velocity.
			
 
				+Hadoop provides massive storage for any kind of data.
			
 
				+Hadoop also provides enormous processing power.
			
 
				+With Hadoop, you can store and process billions of records.
			
 
				+Hadoop ecosystem includes many tools like HDFS, MapReduce, YARN, Spark, etc.
			
 
				+Spark is a fast and general engine for large-scale data processing.
			
 
				+Spark can run on Hadoop, Mesos, standalone, or in the cloud.
			
 
				+Spark provides high-level APIs in Java, Scala, Python, R, and SQL.
			
--- a/python/__init__.py
+++ b/python/__init__.py
@@ -0,0 +1,27 @@
 
				+"""
			
 
				+Hadoop Tools - Python 版本
			
 
				+
			
 
				+提供 Hadoop 数据分析能力，包括：
			
 
				+- HDFS 文件系统操作
			
 
				+- MapReduce 作业执行
			
 
				+- 大数据处理工具
			
 
				+
			
 
				+模块结构：
			
 
				+- hdfs_operations: HDFS 文件系统操作
			
 
				+- mapreduce: MapReduce 作业实现
			
 
				+  - wordcount_streaming: Hadoop Streaming 方式的词频统计
			
 
				+  - wordcount_spark: PySpark 方式的词频统计
			
 
				+- utils: 工具函数
			
 
				+"""
			
 
				+
			
 
				+from .hdfs_operations import HDFSOperations
			
 
				+from .mapreduce.wordcount_streaming import WordCountStreaming
			
 
				+from .mapreduce.wordcount_spark import WordCountSpark
			
 
				+
			
 
				+__all__ = [
			
 
				+    'HDFSOperations',
			
 
				+    'WordCountStreaming',
			
 
				+    'WordCountSpark'
			
 
				+]
			
 
				+
			
 
				+__version__ = '0.1.0'
			
--- a/python/__pycache__/__init__.cpython-312.pyc
+++ b/python/__pycache__/__init__.cpython-312.pyc
--- a/python/__pycache__/hdfs_operations.cpython-312.pyc
+++ b/python/__pycache__/hdfs_operations.cpython-312.pyc
--- a/python/hdfs_operations.py
+++ b/python/hdfs_operations.py
@@ -0,0 +1,365 @@
 
				+"""
			
 
				+HDFS 文件系统操作模块
			
 
				+
			
 
				+提供与 Java 版本 CommonOperation 类相同的功能：
			
 
				+- 创建目录
			
 
				+- 删除目录/文件
			
 
				+- 上传文件
			
 
				+- 读写文件
			
 
				+- 检查文件是否存在
			
 
				+- 列出目录内容
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+from typing import List, Optional, Tuple
			
 
				+from .utils.helpers import run_command, validate_hdfs_path, setup_logger
			
 
				+
			
 
				+
			
 
				+class HDFSOperations:
			
 
				+    """
			
 
				+    HDFS 文件系统操作类
			
 
				+    
			
 
				+    封装了 Hadoop 命令行工具，提供与 HDFS 交互的各种方法。
			
 
				+    功能与 Java 版本的 CommonOperation 类相对应。
			
 
				+    """
			
 
				+    
			
 
				+    def __init__(self, hadoop_home: Optional[str] = None, logger_name: str = 'hdfs_operations'):
			
 
				+        """
			
 
				+        初始化 HDFSOperations 实例
			
 
				+        
			
 
				+        Args:
			
 
				+            hadoop_home: Hadoop 安装目录（可选，默认从环境变量获取）
			
 
				+            logger_name: 日志器名称
			
 
				+        """
			
 
				+        self.logger = setup_logger(logger_name)
			
 
				+        self.hadoop_home = hadoop_home or os.environ.get('HADOOP_HOME', '')
			
 
				+        self.hadoop_cmd = 'hdfs' if self._check_command_exists('hdfs') else 'hadoop'
			
 
				+        
			
 
				+    def _check_command_exists(self, cmd: str) -> bool:
			
 
				+        """
			
 
				+        检查命令是否存在
			
 
				+        
			
 
				+        Args:
			
 
				+            cmd: 命令名称
			
 
				+            
			
 
				+        Returns:
			
 
				+            命令是否存在
			
 
				+        """
			
 
				+        return os.system(f'which {cmd} > /dev/null 2>&1') == 0
			
 
				+    
			
 
				+    def _execute_hdfs_command(self, subcommand: str, args: List[str] = None) -> Tuple[int, str, str]:
			
 
				+        """
			
 
				+        执行 HDFS 命令
			
 
				+        
			
 
				+        Args:
			
 
				+            subcommand: HDFS 子命令（如 dfs, fs 等）
			
 
				+            args: 命令参数列表
			
 
				+            
			
 
				+        Returns:
			
 
				+            (return_code, stdout, stderr)
			
 
				+        """
			
 
				+        args = args or []
			
 
				+        cmd = f"{self.hadoop_cmd} {subcommand} {' '.join(args)}"
			
 
				+        self.logger.debug(f"Executing command: {cmd}")
			
 
				+        return run_command(cmd)
			
 
				+    
			
 
				+    def make_dir(self, path: str) -> bool:
			
 
				+        """
			
 
				+        创建目录
			
 
				+        
			
 
				+        对应 Java 版本的 makeDir 方法。
			
 
				+        
			
 
				+        Args:
			
 
				+            path: 要创建的目录路径
			
 
				+            
			
 
				+        Returns:
			
 
				+            是否创建成功
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> hdfs = HDFSOperations()
			
 
				+            >>> hdfs.make_dir('/user/root/test1')
			
 
				+            True
			
 
				+        """
			
 
				+        if not validate_hdfs_path(path):
			
 
				+            self.logger.error(f"Invalid HDFS path: {path}")
			
 
				+            return False
			
 
				+        
			
 
				+        self.logger.info(f"Creating directory: {path}")
			
 
				+        returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-mkdir', '-p', path])
			
 
				+        
			
 
				+        if returncode == 0:
			
 
				+            self.logger.info(f"Successfully created directory: {path}")
			
 
				+            return True
			
 
				+        else:
			
 
				+            self.logger.error(f"Failed to create directory: {path}, Error: {stderr}")
			
 
				+            return False
			
 
				+    
			
 
				+    def delete(self, path: str, recursive: bool = True) -> bool:
			
 
				+        """
			
 
				+        删除文件或目录
			
 
				+        
			
 
				+        对应 Java 版本的 delDir 和 delFile 方法。
			
 
				+        
			
 
				+        Args:
			
 
				+            path: 要删除的路径
			
 
				+            recursive: 是否递归删除（用于目录）
			
 
				+            
			
 
				+        Returns:
			
 
				+            是否删除成功
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> hdfs = HDFSOperations()
			
 
				+            >>> hdfs.delete('/user/hadoop/data/word.txt')
			
 
				+            True
			
 
				+        """
			
 
				+        if not validate_hdfs_path(path):
			
 
				+            self.logger.error(f"Invalid HDFS path: {path}")
			
 
				+            return False
			
 
				+        
			
 
				+        self.logger.info(f"Deleting: {path}")
			
 
				+        args = ['-rm', '-r'] if recursive else ['-rm']
			
 
				+        args.append(path)
			
 
				+        
			
 
				+        returncode, stdout, stderr = self._execute_hdfs_command('dfs', args)
			
 
				+        
			
 
				+        if returncode == 0:
			
 
				+            self.logger.info(f"Successfully deleted: {path}")
			
 
				+            return True
			
 
				+        else:
			
 
				+            self.logger.error(f"Failed to delete: {path}, Error: {stderr}")
			
 
				+            return False
			
 
				+    
			
 
				+    def copy_from_local(self, src: str, dst: str) -> bool:
			
 
				+        """
			
 
				+        从本地文件系统上传文件到 HDFS
			
 
				+        
			
 
				+        对应 Java 版本的 putFile 方法。
			
 
				+        
			
 
				+        Args:
			
 
				+            src: 本地文件路径
			
 
				+            dst: HDFS 目标路径
			
 
				+            
			
 
				+        Returns:
			
 
				+            是否上传成功
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> hdfs = HDFSOperations()
			
 
				+            >>> hdfs.copy_from_local('/home/hadoop/word.txt', '/user/hadoop/data/')
			
 
				+            True
			
 
				+        """
			
 
				+        if not os.path.exists(src):
			
 
				+            self.logger.error(f"Local file not found: {src}")
			
 
				+            return False
			
 
				+        
			
 
				+        if not validate_hdfs_path(dst):
			
 
				+            self.logger.error(f"Invalid HDFS path: {dst}")
			
 
				+            return False
			
 
				+        
			
 
				+        self.logger.info(f"Copying from local {src} to HDFS {dst}")
			
 
				+        returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-copyFromLocal', src, dst])
			
 
				+        
			
 
				+        if returncode == 0:
			
 
				+            self.logger.info(f"Successfully copied {src} to {dst}")
			
 
				+            return True
			
 
				+        else:
			
 
				+            self.logger.error(f"Failed to copy {src} to {dst}, Error: {stderr}")
			
 
				+            return False
			
 
				+    
			
 
				+    def copy_to_local(self, src: str, dst: str) -> bool:
			
 
				+        """
			
 
				+        从 HDFS 下载文件到本地文件系统
			
 
				+        
			
 
				+        Args:
			
 
				+            src: HDFS 源路径
			
 
				+            dst: 本地目标路径
			
 
				+            
			
 
				+        Returns:
			
 
				+            是否下载成功
			
 
				+        """
			
 
				+        if not validate_hdfs_path(src):
			
 
				+            self.logger.error(f"Invalid HDFS path: {src}")
			
 
				+            return False
			
 
				+        
			
 
				+        self.logger.info(f"Copying from HDFS {src} to local {dst}")
			
 
				+        returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-copyToLocal', src, dst])
			
 
				+        
			
 
				+        if returncode == 0:
			
 
				+            self.logger.info(f"Successfully copied {src} to {dst}")
			
 
				+            return True
			
 
				+        else:
			
 
				+            self.logger.error(f"Failed to copy {src} to {dst}, Error: {stderr}")
			
 
				+            return False
			
 
				+    
			
 
				+    def read_file(self, path: str) -> Optional[str]:
			
 
				+        """
			
 
				+        读取 HDFS 文件内容
			
 
				+        
			
 
				+        对应 Java 版本的 readFile 方法。
			
 
				+        
			
 
				+        Args:
			
 
				+            path: HDFS 文件路径
			
 
				+            
			
 
				+        Returns:
			
 
				+            文件内容（字符串），如果失败返回 None
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> hdfs = HDFSOperations()
			
 
				+            >>> content = hdfs.read_file('/user/hadoop/data/write.txt')
			
 
				+            >>> print(content)
			
 
				+            da jia hao,cai shi zhen de hao!
			
 
				+        """
			
 
				+        if not validate_hdfs_path(path):
			
 
				+            self.logger.error(f"Invalid HDFS path: {path}")
			
 
				+            return None
			
 
				+        
			
 
				+        if not self.exists(path):
			
 
				+            self.logger.error(f"File does not exist: {path}")
			
 
				+            return None
			
 
				+        
			
 
				+        self.logger.info(f"Reading file: {path}")
			
 
				+        returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-cat', path])
			
 
				+        
			
 
				+        if returncode == 0:
			
 
				+            self.logger.info(f"Successfully read file: {path}")
			
 
				+            return stdout
			
 
				+        else:
			
 
				+            self.logger.error(f"Failed to read file: {path}, Error: {stderr}")
			
 
				+            return None
			
 
				+    
			
 
				+    def write_file(self, path: str, content: str, overwrite: bool = True) -> bool:
			
 
				+        """
			
 
				+        写入内容到 HDFS 文件
			
 
				+        
			
 
				+        对应 Java 版本的 writeFile 方法。
			
 
				+        
			
 
				+        Args:
			
 
				+            path: HDFS 文件路径
			
 
				+            content: 要写入的内容
			
 
				+            overwrite: 是否覆盖已存在的文件
			
 
				+            
			
 
				+        Returns:
			
 
				+            是否写入成功
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> hdfs = HDFSOperations()
			
 
				+            >>> hdfs.write_file('/user/hadoop/data/write.txt', 'da jia hao,cai shi zhen de hao!')
			
 
				+            True
			
 
				+        """
			
 
				+        if not validate_hdfs_path(path):
			
 
				+            self.logger.error(f"Invalid HDFS path: {path}")
			
 
				+            return False
			
 
				+        
			
 
				+        self.logger.info(f"Writing to file: {path}")
			
 
				+        
			
 
				+        # 创建临时文件
			
 
				+        import tempfile
			
 
				+        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as temp_file:
			
 
				+            temp_file.write(content)
			
 
				+            temp_path = temp_file.name
			
 
				+        
			
 
				+        try:
			
 
				+            # 使用 put 命令上传临时文件
			
 
				+            args = ['-put']
			
 
				+            if overwrite:
			
 
				+                args.append('-f')
			
 
				+            args.extend([temp_path, path])
			
 
				+            
			
 
				+            returncode, stdout, stderr = self._execute_hdfs_command('dfs', args)
			
 
				+            
			
 
				+            if returncode == 0:
			
 
				+                self.logger.info(f"Successfully wrote to file: {path}")
			
 
				+                return True
			
 
				+            else:
			
 
				+                self.logger.error(f"Failed to write to file: {path}, Error: {stderr}")
			
 
				+                return False
			
 
				+        finally:
			
 
				+            # 清理临时文件
			
 
				+            if os.path.exists(temp_path):
			
 
				+                os.unlink(temp_path)
			
 
				+    
			
 
				+    def exists(self, path: str) -> bool:
			
 
				+        """
			
 
				+        检查 HDFS 路径是否存在
			
 
				+        
			
 
				+        Args:
			
 
				+            path: HDFS 路径
			
 
				+            
			
 
				+        Returns:
			
 
				+            路径是否存在
			
 
				+        """
			
 
				+        if not validate_hdfs_path(path):
			
 
				+            return False
			
 
				+        
			
 
				+        returncode, _, _ = self._execute_hdfs_command('dfs', ['-test', '-e', path])
			
 
				+        return returncode == 0
			
 
				+    
			
 
				+    def list_dir(self, path: str) -> List[str]:
			
 
				+        """
			
 
				+        列出 HDFS 目录内容
			
 
				+        
			
 
				+        Args:
			
 
				+            path: HDFS 目录路径
			
 
				+            
			
 
				+        Returns:
			
 
				+            目录内容列表
			
 
				+        """
			
 
				+        if not validate_hdfs_path(path):
			
 
				+            self.logger.error(f"Invalid HDFS path: {path}")
			
 
				+            return []
			
 
				+        
			
 
				+        if not self.exists(path):
			
 
				+            self.logger.error(f"Directory does not exist: {path}")
			
 
				+            return []
			
 
				+        
			
 
				+        returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-ls', path])
			
 
				+        
			
 
				+        if returncode == 0:
			
 
				+            # 解析输出，提取文件名
			
 
				+            lines = stdout.strip().split('\n')
			
 
				+            # 跳过第一行（如果是目录列表的标题）
			
 
				+            if len(lines) > 0 and lines[0].startswith('Found'):
			
 
				+                lines = lines[1:]
			
 
				+            
			
 
				+            # 提取文件名（每一行的最后一个字段）
			
 
				+            files = []
			
 
				+            for line in lines:
			
 
				+                parts = line.split()
			
 
				+                if len(parts) >= 8:
			
 
				+                    files.append(parts[-1])
			
 
				+            return files
			
 
				+        else:
			
 
				+            self.logger.error(f"Failed to list directory: {path}, Error: {stderr}")
			
 
				+            return []
			
 
				+    
			
 
				+    def get_file_size(self, path: str) -> Optional[int]:
			
 
				+        """
			
 
				+        获取 HDFS 文件大小
			
 
				+        
			
 
				+        Args:
			
 
				+            path: HDFS 文件路径
			
 
				+            
			
 
				+        Returns:
			
 
				+            文件大小（字节），如果失败返回 None
			
 
				+        """
			
 
				+        if not validate_hdfs_path(path):
			
 
				+            self.logger.error(f"Invalid HDFS path: {path}")
			
 
				+            return None
			
 
				+        
			
 
				+        if not self.exists(path):
			
 
				+            self.logger.error(f"File does not exist: {path}")
			
 
				+            return None
			
 
				+        
			
 
				+        returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-du', '-s', path])
			
 
				+        
			
 
				+        if returncode == 0:
			
 
				+            # 解析输出，提取文件大小
			
 
				+            parts = stdout.strip().split()
			
 
				+            if len(parts) >= 1:
			
 
				+                try:
			
 
				+                    return int(parts[0])
			
 
				+                except ValueError:
			
 
				+                    self.logger.error(f"Failed to parse file size: {stdout}")
			
 
				+                    return None
			
 
				+        else:
			
 
				+            self.logger.error(f"Failed to get file size: {path}, Error: {stderr}")
			
 
				+            return None
			
--- a/python/mapreduce/__init__.py
+++ b/python/mapreduce/__init__.py
@@ -0,0 +1,15 @@
 
				+"""
			
 
				+MapReduce 相关模块
			
 
				+
			
 
				+提供多种 MapReduce 作业实现方式：
			
 
				+- Hadoop Streaming: 使用标准输入输出与 Hadoop 交互
			
 
				+- PySpark: 现代、高效的大数据处理框架
			
 
				+"""
			
 
				+
			
 
				+from .wordcount_streaming import WordCountStreaming
			
 
				+from .wordcount_spark import WordCountSpark
			
 
				+
			
 
				+__all__ = [
			
 
				+    'WordCountStreaming',
			
 
				+    'WordCountSpark'
			
 
				+]
			
--- a/python/mapreduce/__pycache__/__init__.cpython-312.pyc
+++ b/python/mapreduce/__pycache__/__init__.cpython-312.pyc
--- a/python/mapreduce/__pycache__/wordcount_spark.cpython-312.pyc
+++ b/python/mapreduce/__pycache__/wordcount_spark.cpython-312.pyc
--- a/python/mapreduce/__pycache__/wordcount_streaming.cpython-312.pyc
+++ b/python/mapreduce/__pycache__/wordcount_streaming.cpython-312.pyc
--- a/python/mapreduce/wordcount_spark.py
+++ b/python/mapreduce/wordcount_spark.py
@@ -0,0 +1,383 @@
 
				+"""
			
 
				+PySpark 方式的词频统计模块
			
 
				+
			
 
				+使用 PySpark 实现词频统计，这是现代大数据处理的推荐方式：
			
 
				+- 更简洁的 API
			
 
				+- 更好的性能
			
 
				+- 支持更多的数据处理操作
			
 
				+- 可以与 Spark SQL、MLlib 等集成
			
 
				+
			
 
				+对应 Java 版本的 WordCount 类，但使用更现代的 Spark 框架。
			
 
				+
			
 
				+使用方式：
			
 
				+1. 作为模块导入使用：
			
 
				+   from wordcount_spark import WordCountSpark
			
 
				+   wc = WordCountSpark()
			
 
				+   result = wc.run(input_path, output_path)
			
 
				+
			
 
				+2. 作为独立脚本运行：
			
 
				+   $ python wordcount_spark.py <input_path> <output_path>
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+from typing import Dict, List, Optional, Tuple
			
 
				+from collections import defaultdict
			
 
				+from ..utils.helpers import setup_logger, format_file_size
			
 
				+
			
 
				+
			
 
				+class WordCountSpark:
			
 
				+    """
			
 
				+    PySpark 方式的词频统计类
			
 
				+    
			
 
				+    封装了 PySpark 作业的执行，提供高效的词频统计功能。
			
 
				+    """
			
 
				+    
			
 
				+    def __init__(self, app_name: str = 'WordCount', 
			
 
				+                 master: Optional[str] = None,
			
 
				+                 logger_name: str = 'wordcount_spark'):
			
 
				+        """
			
 
				+        初始化 WordCountSpark 实例
			
 
				+        
			
 
				+        Args:
			
 
				+            app_name: Spark 应用名称
			
 
				+            master: Spark 主节点 URL（可选，如 'local[*]', 'spark://master:7077'）
			
 
				+                    如果为 None，Spark 会从配置中自动获取
			
 
				+            logger_name: 日志器名称
			
 
				+        """
			
 
				+        self.logger = setup_logger(logger_name)
			
 
				+        self.app_name = app_name
			
 
				+        self.master = master
			
 
				+        self.spark = None
			
 
				+        self.sc = None
			
 
				+        
			
 
				+    def _init_spark(self):
			
 
				+        """
			
 
				+        初始化 Spark 会话和上下文
			
 
				+        
			
 
				+        延迟初始化，只有在需要时才创建 Spark 实例。
			
 
				+        """
			
 
				+        if self.spark is not None:
			
 
				+            return
			
 
				+        
			
 
				+        try:
			
 
				+            from pyspark.sql import SparkSession
			
 
				+            
			
 
				+            builder = SparkSession.builder.appName(self.app_name)
			
 
				+            if self.master:
			
 
				+                builder = builder.master(self.master)
			
 
				+            
			
 
				+            # 配置一些常用参数
			
 
				+            builder = builder.config("spark.sql.shuffle.partitions", "2")
			
 
				+            builder = builder.config("spark.driver.memory", "1g")
			
 
				+            builder = builder.config("spark.executor.memory", "1g")
			
 
				+            
			
 
				+            self.spark = builder.getOrCreate()
			
 
				+            self.sc = self.spark.sparkContext
			
 
				+            
			
 
				+            self.logger.info(f"Spark session initialized: {self.app_name}")
			
 
				+            self.logger.info(f"Spark master: {self.sc.master}")
			
 
				+            self.logger.info(f"Spark version: {self.sc.version}")
			
 
				+            
			
 
				+        except ImportError as e:
			
 
				+            self.logger.error(f"PySpark is not installed: {e}")
			
 
				+            raise
			
 
				+        except Exception as e:
			
 
				+            self.logger.error(f"Failed to initialize Spark: {e}")
			
 
				+            raise
			
 
				+    
			
 
				+    def stop(self):
			
 
				+        """
			
 
				+        停止 Spark 会话
			
 
				+        """
			
 
				+        if self.spark:
			
 
				+            self.spark.stop()
			
 
				+            self.spark = None
			
 
				+            self.sc = None
			
 
				+            self.logger.info("Spark session stopped")
			
 
				+    
			
 
				+    def count_words_from_rdd(self, text_rdd) -> Dict[str, int]:
			
 
				+        """
			
 
				+        从 RDD 统计单词
			
 
				+        
			
 
				+        对应 Java 版本的 WordCount 逻辑，但使用 Spark 的算子。
			
 
				+        
			
 
				+        Args:
			
 
				+            text_rdd: 包含文本的 RDD
			
 
				+            
			
 
				+        Returns:
			
 
				+            单词计数字典
			
 
				+        """
			
 
				+        # 1. 分割每行文本为单词
			
 
				+        # 对应 Java 的 TokenizerMapper.map 方法
			
 
				+        words_rdd = text_rdd.flatMap(self._split_line)
			
 
				+        
			
 
				+        # 2. 映射为 (单词, 1)
			
 
				+        pairs_rdd = words_rdd.map(lambda word: (word, 1))
			
 
				+        
			
 
				+        # 3. 按单词聚合计数
			
 
				+        # 对应 Java 的 IntSumReducer.reduce 方法
			
 
				+        word_counts_rdd = pairs_rdd.reduceByKey(lambda x, y: x + y)
			
 
				+        
			
 
				+        # 4. 收集结果到本地
			
 
				+        result = word_counts_rdd.collectAsMap()
			
 
				+        
			
 
				+        return dict(result)
			
 
				+    
			
 
				+    def _split_line(self, line: str) -> List[str]:
			
 
				+        """
			
 
				+        分割一行文本为单词列表
			
 
				+        
			
 
				+        Args:
			
 
				+            line: 输入文本行
			
 
				+            
			
 
				+        Returns:
			
 
				+            单词列表
			
 
				+        """
			
 
				+        words = []
			
 
				+        # 分割文本为单词（使用空格、制表符等分隔符）
			
 
				+        raw_words = line.strip().split()
			
 
				+        for word in raw_words:
			
 
				+            # 清理单词（移除标点符号，转为小写）
			
 
				+            word = word.strip('.,!?;:()[]{}"\'').lower()
			
 
				+            if word:  # 确保单词非空
			
 
				+                words.append(word)
			
 
				+        return words
			
 
				+    
			
 
				+    def count_words_from_dataframe(self, df, text_column: str = 'value') -> Dict[str, int]:
			
 
				+        """
			
 
				+        从 DataFrame 统计单词（使用 Spark SQL 风格）
			
 
				+        
			
 
				+        更高级的 API，适合复杂的数据处理。
			
 
				+        
			
 
				+        Args:
			
 
				+            df: 包含文本的 DataFrame
			
 
				+            text_column: 包含文本的列名
			
 
				+            
			
 
				+        Returns:
			
 
				+            单词计数字典
			
 
				+        """
			
 
				+        from pyspark.sql.functions import explode, split, lower, trim, regexp_replace, col, count
			
 
				+        
			
 
				+        # 1. 清理文本（移除标点符号，转为小写）
			
 
				+        df_clean = df.withColumn(
			
 
				+            'clean_text',
			
 
				+            lower(trim(regexp_replace(col(text_column), '[^a-zA-Z0-9\\s]', ' ')))
			
 
				+        )
			
 
				+        
			
 
				+        # 2. 分割为单词
			
 
				+        df_words = df_clean.withColumn(
			
 
				+            'word',
			
 
				+            explode(split(col('clean_text'), '\\s+'))
			
 
				+        )
			
 
				+        
			
 
				+        # 3. 过滤空单词
			
 
				+        df_filtered = df_words.filter(col('word') != '')
			
 
				+        
			
 
				+        # 4. 按单词分组计数
			
 
				+        df_counts = df_filtered.groupBy('word').agg(count('*').alias('count'))
			
 
				+        
			
 
				+        # 5. 收集结果
			
 
				+        result = {row['word']: row['count'] for row in df_counts.collect()}
			
 
				+        
			
 
				+        return result
			
 
				+    
			
 
				+    def run(self, input_path: str, output_path: Optional[str] = None,
			
 
				+            use_dataframe: bool = True) -> Dict[str, int]:
			
 
				+        """
			
 
				+        运行完整的 WordCount 作业
			
 
				+        
			
 
				+        Args:
			
 
				+            input_path: 输入路径（可以是本地文件路径或 HDFS 路径）
			
 
				+            output_path: 输出路径（可选，如果指定则保存结果）
			
 
				+            use_dataframe: 是否使用 DataFrame API（否则使用 RDD API）
			
 
				+            
			
 
				+        Returns:
			
 
				+            单词计数字典
			
 
				+        """
			
 
				+        self._init_spark()
			
 
				+        
			
 
				+        self.logger.info(f"Running WordCount job on: {input_path}")
			
 
				+        
			
 
				+        if use_dataframe:
			
 
				+            # 使用 DataFrame API
			
 
				+            df = self.spark.read.text(input_path)
			
 
				+            result = self.count_words_from_dataframe(df)
			
 
				+        else:
			
 
				+            # 使用 RDD API
			
 
				+            text_rdd = self.sc.textFile(input_path)
			
 
				+            result = self.count_words_from_rdd(text_rdd)
			
 
				+        
			
 
				+        # 保存结果（如果指定了输出路径）
			
 
				+        if output_path:
			
 
				+            self._save_result(result, output_path)
			
 
				+        
			
 
				+        # 打印统计信息
			
 
				+        self._print_statistics(result)
			
 
				+        
			
 
				+        return result
			
 
				+    
			
 
				+    def _save_result(self, result: Dict[str, int], output_path: str):
			
 
				+        """
			
 
				+        保存结果到文件
			
 
				+        
			
 
				+        Args:
			
 
				+            result: 单词计数字典
			
 
				+            output_path: 输出路径
			
 
				+        """
			
 
				+        self.logger.info(f"Saving results to: {output_path}")
			
 
				+        
			
 
				+        # 转换为 RDD 并保存
			
 
				+        result_rdd = self.sc.parallelize([
			
 
				+            f"{word}\t{count}" 
			
 
				+            for word, count in sorted(result.items())
			
 
				+        ])
			
 
				+        result_rdd.saveAsTextFile(output_path)
			
 
				+        
			
 
				+        self.logger.info(f"Results saved to: {output_path}")
			
 
				+    
			
 
				+    def _print_statistics(self, result: Dict[str, int]):
			
 
				+        """
			
 
				+        打印统计信息
			
 
				+        
			
 
				+        Args:
			
 
				+            result: 单词计数字典
			
 
				+        """
			
 
				+        if not result:
			
 
				+            self.logger.info("No words found")
			
 
				+            return
			
 
				+        
			
 
				+        total_words = sum(result.values())
			
 
				+        unique_words = len(result)
			
 
				+        sorted_words = sorted(result.items(), key=lambda x: x[1], reverse=True)
			
 
				+        
			
 
				+        self.logger.info("=" * 50)
			
 
				+        self.logger.info("WordCount Statistics")
			
 
				+        self.logger.info("=" * 50)
			
 
				+        self.logger.info(f"Total words: {total_words}")
			
 
				+        self.logger.info(f"Unique words: {unique_words}")
			
 
				+        self.logger.info("-" * 50)
			
 
				+        self.logger.info("Top 10 words:")
			
 
				+        
			
 
				+        for i, (word, count) in enumerate(sorted_words[:10], 1):
			
 
				+            percentage = (count / total_words) * 100
			
 
				+            self.logger.info(f"  {i:2d}. {word:15s} {count:5d} ({percentage:5.1f}%)")
			
 
				+        
			
 
				+        self.logger.info("=" * 50)
			
 
				+    
			
 
				+    def count_words_locally(self, text: str) -> Dict[str, int]:
			
 
				+        """
			
 
				+        本地统计单词（不使用 Spark 集群）
			
 
				+        
			
 
				+        用于测试和小规模数据处理。
			
 
				+        
			
 
				+        Args:
			
 
				+            text: 输入文本
			
 
				+            
			
 
				+        Returns:
			
 
				+            单词计数字典
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> wc = WordCountSpark()
			
 
				+            >>> wc.count_words_locally("hello world hello")
			
 
				+            {'hello': 2, 'world': 1}
			
 
				+        """
			
 
				+        word_counts = defaultdict(int)
			
 
				+        
			
 
				+        for line in text.split('\n'):
			
 
				+            words = self._split_line(line)
			
 
				+            for word in words:
			
 
				+                word_counts[word] += 1
			
 
				+        
			
 
				+        return dict(word_counts)
			
 
				+    
			
 
				+    def run_with_files(self, files: List[str], output_path: Optional[str] = None) -> Dict[str, int]:
			
 
				+        """
			
 
				+        对多个文件运行词频统计
			
 
				+        
			
 
				+        Args:
			
 
				+            files: 文件路径列表
			
 
				+            output_path: 输出路径（可选）
			
 
				+            
			
 
				+        Returns:
			
 
				+            单词计数字典
			
 
				+        """
			
 
				+        # 合并所有文件的内容
			
 
				+        all_text = ""
			
 
				+        for file_path in files:
			
 
				+            try:
			
 
				+                with open(file_path, 'r', encoding='utf-8') as f:
			
 
				+                    all_text += f.read() + "\n"
			
 
				+            except Exception as e:
			
 
				+                self.logger.warning(f"Failed to read file {file_path}: {e}")
			
 
				+        
			
 
				+        # 本地统计
			
 
				+        result = self.count_words_locally(all_text)
			
 
				+        
			
 
				+        # 保存结果
			
 
				+        if output_path:
			
 
				+            with open(output_path, 'w', encoding='utf-8') as f:
			
 
				+                for word, count in sorted(result.items()):
			
 
				+                    f.write(f"{word}\t{count}\n")
			
 
				+        
			
 
				+        # 打印统计信息
			
 
				+        self._print_statistics(result)
			
 
				+        
			
 
				+        return result
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """
			
 
				+    主函数：作为独立脚本运行
			
 
				+    
			
 
				+    使用方式：
			
 
				+    python wordcount_spark.py <input_path> [output_path]
			
 
				+    """
			
 
				+    if len(sys.argv) < 2:
			
 
				+        print("Usage: python wordcount_spark.py <input_path> [output_path]")
			
 
				+        print("Examples:")
			
 
				+        print("  python wordcount_spark.py input.txt")
			
 
				+        print("  python wordcount_spark.py hdfs:///user/hadoop/data output")
			
 
				+        print("  python wordcount_spark.py --local input.txt output.txt")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    # 解析参数
			
 
				+    use_local = False
			
 
				+    input_path = None
			
 
				+    output_path = None
			
 
				+    
			
 
				+    i = 1
			
 
				+    while i < len(sys.argv):
			
 
				+        arg = sys.argv[i]
			
 
				+        if arg == '--local':
			
 
				+            use_local = True
			
 
				+        elif input_path is None:
			
 
				+            input_path = arg
			
 
				+        else:
			
 
				+            output_path = arg
			
 
				+        i += 1
			
 
				+    
			
 
				+    if input_path is None:
			
 
				+        print("Error: Input path is required")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    wc = WordCountSpark()
			
 
				+    
			
 
				+    try:
			
 
				+        if use_local:
			
 
				+            # 本地模式（不使用 Spark）
			
 
				+            result = wc.run_with_files([input_path], output_path)
			
 
				+        else:
			
 
				+            # Spark 模式
			
 
				+            result = wc.run(input_path, output_path)
			
 
				+        
			
 
				+        # 打印结果
			
 
				+        print("\nFinal results:")
			
 
				+        for word, count in sorted(result.items(), key=lambda x: x[1], reverse=True)[:20]:
			
 
				+            print(f"{word}: {count}")
			
 
				+    
			
 
				+    finally:
			
 
				+        wc.stop()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    main()
			
--- a/python/mapreduce/wordcount_streaming.py
+++ b/python/mapreduce/wordcount_streaming.py
@@ -0,0 +1,346 @@
 
				+"""
			
 
				+Hadoop Streaming 方式的词频统计模块
			
 
				+
			
 
				+对应 Java 版本的 WordCount 类，使用 Hadoop Streaming 方式实现：
			
 
				+- Mapper: 从标准输入读取数据，分割为单词，输出 <单词, 1>
			
 
				+- Reducer: 从标准输入读取 Mapper 输出，统计每个单词的总次数
			
 
				+- Combiner: 可选的本地合并，减少数据传输
			
 
				+
			
 
				+使用方式：
			
 
				+1. 作为独立脚本运行（用于 Hadoop Streaming）：
			
 
				+   $ python wordcount_streaming.py mapper < input.txt
			
 
				+   $ python wordcount_streaming.py reducer < mapper_output.txt
			
 
				+
			
 
				+2. 作为模块导入使用：
			
 
				+   from wordcount_streaming import WordCountStreaming
			
 
				+   wc = WordCountStreaming()
			
 
				+   wc.run(input_path, output_path)
			
 
				+"""
			
 
				+
			
 
				+import sys
			
 
				+from collections import defaultdict
			
 
				+from typing import Dict, List, Optional, Tuple
			
 
				+from ..utils.helpers import run_command, setup_logger
			
 
				+
			
 
				+
			
 
				+class WordCountStreaming:
			
 
				+    """
			
 
				+    Hadoop Streaming 方式的词频统计类
			
 
				+    
			
 
				+    封装了 Hadoop Streaming 作业的执行，提供与 Java 版本 WordCount 类类似的功能。
			
 
				+    """
			
 
				+    
			
 
				+    def __init__(self, hadoop_home: Optional[str] = None, logger_name: str = 'wordcount_streaming'):
			
 
				+        """
			
 
				+        初始化 WordCountStreaming 实例
			
 
				+        
			
 
				+        Args:
			
 
				+            hadoop_home: Hadoop 安装目录（可选，默认从环境变量获取）
			
 
				+            logger_name: 日志器名称
			
 
				+        """
			
 
				+        self.logger = setup_logger(logger_name)
			
 
				+        self.hadoop_home = hadoop_home or __import__('os').environ.get('HADOOP_HOME', '')
			
 
				+        self.hadoop_cmd = 'hadoop'
			
 
				+        
			
 
				+    def mapper(self, line: str) -> List[Tuple[str, int]]:
			
 
				+        """
			
 
				+        Mapper 函数：将一行文本分割为单词，输出 <单词, 1>
			
 
				+        
			
 
				+        对应 Java 版本的 TokenizerMapper.map 方法。
			
 
				+        
			
 
				+        Args:
			
 
				+            line: 输入的一行文本
			
 
				+            
			
 
				+        Returns:
			
 
				+            单词和计数的元组列表
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> wc = WordCountStreaming()
			
 
				+            >>> wc.mapper("hello world hello")
			
 
				+            [('hello', 1), ('world', 1), ('hello', 1)]
			
 
				+        """
			
 
				+        results = []
			
 
				+        # 分割文本为单词（使用空格、制表符等分隔符）
			
 
				+        words = line.strip().split()
			
 
				+        for word in words:
			
 
				+            # 清理单词（移除标点符号，转为小写）
			
 
				+            word = word.strip('.,!?;:()[]{}"\'').lower()
			
 
				+            if word:  # 确保单词非空
			
 
				+                results.append((word, 1))
			
 
				+        return results
			
 
				+    
			
 
				+    def combiner(self, pairs: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
			
 
				+        """
			
 
				+        Combiner 函数：在 Mapper 端进行本地合并
			
 
				+        
			
 
				+        对应 Java 版本的 Combiner 功能，减少数据传输量。
			
 
				+        
			
 
				+        Args:
			
 
				+            pairs: Mapper 输出的 <单词, 1> 列表
			
 
				+            
			
 
				+        Returns:
			
 
				+            合并后的 <单词, 本地计数> 列表
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> wc = WordCountStreaming()
			
 
				+            >>> wc.combiner([('hello', 1), ('world', 1), ('hello', 1)])
			
 
				+            [('hello', 2), ('world', 1)]
			
 
				+        """
			
 
				+        word_counts = defaultdict(int)
			
 
				+        for word, count in pairs:
			
 
				+            word_counts[word] += count
			
 
				+        return [(word, count) for word, count in word_counts.items()]
			
 
				+    
			
 
				+    def reducer(self, word: str, counts: List[int]) -> Tuple[str, int]:
			
 
				+        """
			
 
				+        Reducer 函数：统计每个单词的总次数
			
 
				+        
			
 
				+        对应 Java 版本的 IntSumReducer.reduce 方法。
			
 
				+        
			
 
				+        Args:
			
 
				+            word: 单词
			
 
				+            counts: 该单词的所有计数列表
			
 
				+            
			
 
				+        Returns:
			
 
				+            <单词, 总次数> 元组
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> wc = WordCountStreaming()
			
 
				+            >>> wc.reducer('hello', [1, 1, 1])
			
 
				+            ('hello', 3)
			
 
				+        """
			
 
				+        total = sum(counts)
			
 
				+        return (word, total)
			
 
				+    
			
 
				+    def run_mapper_from_stdin(self):
			
 
				+        """
			
 
				+        从标准输入运行 Mapper（用于 Hadoop Streaming）
			
 
				+        
			
 
				+        从 stdin 读取每行数据，执行 Mapper 逻辑，输出到 stdout。
			
 
				+        """
			
 
				+        for line in sys.stdin:
			
 
				+            pairs = self.mapper(line)
			
 
				+            for word, count in pairs:
			
 
				+                print(f"{word}\t{count}")
			
 
				+    
			
 
				+    def run_reducer_from_stdin(self):
			
 
				+        """
			
 
				+        从标准输入运行 Reducer（用于 Hadoop Streaming）
			
 
				+        
			
 
				+        从 stdin 读取 Mapper 输出，执行 Reducer 逻辑，输出到 stdout。
			
 
				+        假设输入已经按键排序（Hadoop Streaming 会自动排序）。
			
 
				+        """
			
 
				+        current_word = None
			
 
				+        current_counts = []
			
 
				+        
			
 
				+        for line in sys.stdin:
			
 
				+            line = line.strip()
			
 
				+            if not line:
			
 
				+                continue
			
 
				+            
			
 
				+            # 解析输入：单词\t计数
			
 
				+            parts = line.split('\t', 1)
			
 
				+            if len(parts) != 2:
			
 
				+                continue
			
 
				+            
			
 
				+            word, count_str = parts
			
 
				+            try:
			
 
				+                count = int(count_str)
			
 
				+            except ValueError:
			
 
				+                continue
			
 
				+            
			
 
				+            # 处理相同单词的计数
			
 
				+            if current_word == word:
			
 
				+                current_counts.append(count)
			
 
				+            else:
			
 
				+                # 输出前一个单词的结果
			
 
				+                if current_word is not None:
			
 
				+                    result_word, result_count = self.reducer(current_word, current_counts)
			
 
				+                    print(f"{result_word}\t{result_count}")
			
 
				+                
			
 
				+                # 开始处理新单词
			
 
				+                current_word = word
			
 
				+                current_counts = [count]
			
 
				+        
			
 
				+        # 输出最后一个单词的结果
			
 
				+        if current_word is not None:
			
 
				+            result_word, result_count = self.reducer(current_word, current_counts)
			
 
				+            print(f"{result_word}\t{result_count}")
			
 
				+    
			
 
				+    def run(self, input_path: str, output_path: str, 
			
 
				+            mapper_script: Optional[str] = None,
			
 
				+            reducer_script: Optional[str] = None,
			
 
				+            combiner: bool = True,
			
 
				+            num_reducers: int = 1) -> bool:
			
 
				+        """
			
 
				+        运行完整的 WordCount 作业
			
 
				+        
			
 
				+        使用 Hadoop Streaming 提交作业到 Hadoop 集群。
			
 
				+        
			
 
				+        Args:
			
 
				+            input_path: HDFS 输入路径
			
 
				+            output_path: HDFS 输出路径（不能已存在）
			
 
				+            mapper_script: Mapper 脚本路径（可选，默认使用当前脚本）
			
 
				+            reducer_script: Reducer 脚本路径（可选，默认使用当前脚本）
			
 
				+            combiner: 是否使用 Combiner
			
 
				+            num_reducers: Reducer 任务数量
			
 
				+            
			
 
				+        Returns:
			
 
				+            作业是否成功完成
			
 
				+        """
			
 
				+        import os
			
 
				+        
			
 
				+        # 确定脚本路径
			
 
				+        if mapper_script is None:
			
 
				+            mapper_script = __file__
			
 
				+        if reducer_script is None:
			
 
				+            reducer_script = __file__
			
 
				+        
			
 
				+        # 构建 Hadoop Streaming 命令
			
 
				+        streaming_jar = self._find_streaming_jar()
			
 
				+        if not streaming_jar:
			
 
				+            self.logger.error("Could not find Hadoop Streaming jar")
			
 
				+            return False
			
 
				+        
			
 
				+        cmd_parts = [
			
 
				+            self.hadoop_cmd,
			
 
				+            'jar', streaming_jar,
			
 
				+            '-files', f"{mapper_script},{reducer_script}",
			
 
				+            '-mapper', f"python3 {os.path.basename(mapper_script)} mapper",
			
 
				+            '-reducer', f"python3 {os.path.basename(reducer_script)} reducer",
			
 
				+            '-input', input_path,
			
 
				+            '-output', output_path,
			
 
				+            '-D', f"mapreduce.job.reduces={num_reducers}"
			
 
				+        ]
			
 
				+        
			
 
				+        if combiner:
			
 
				+            cmd_parts.extend(['-combiner', f"python3 {os.path.basename(mapper_script)} mapper | sort | python3 {os.path.basename(reducer_script)} reducer"])
			
 
				+        
			
 
				+        cmd = ' '.join(cmd_parts)
			
 
				+        self.logger.info(f"Running Hadoop Streaming job: {cmd}")
			
 
				+        
			
 
				+        returncode, stdout, stderr = run_command(cmd, timeout=3600)  # 1小时超时
			
 
				+        
			
 
				+        if returncode == 0:
			
 
				+            self.logger.info("WordCount job completed successfully")
			
 
				+            self.logger.info(f"Output: {stdout}")
			
 
				+            return True
			
 
				+        else:
			
 
				+            self.logger.error(f"WordCount job failed with return code {returncode}")
			
 
				+            self.logger.error(f"Stderr: {stderr}")
			
 
				+            return False
			
 
				+    
			
 
				+    def _find_streaming_jar(self) -> Optional[str]:
			
 
				+        """
			
 
				+        查找 Hadoop Streaming jar 文件
			
 
				+        
			
 
				+        Returns:
			
 
				+            Streaming jar 文件路径，如果未找到返回 None
			
 
				+        """
			
 
				+        import os
			
 
				+        import glob
			
 
				+        
			
 
				+        # 尝试从常见位置查找
			
 
				+        search_paths = [
			
 
				+            os.path.join(self.hadoop_home, 'share', 'hadoop', 'tools', 'lib'),
			
 
				+            os.path.join(self.hadoop_home, 'contrib', 'streaming'),
			
 
				+            '/usr/lib/hadoop-mapreduce',
			
 
				+            '/usr/hdp/current/hadoop-mapreduce-client'
			
 
				+        ]
			
 
				+        
			
 
				+        for path in search_paths:
			
 
				+            if os.path.exists(path):
			
 
				+                jars = glob.glob(os.path.join(path, 'hadoop-streaming-*.jar'))
			
 
				+                if jars:
			
 
				+                    return jars[0]
			
 
				+        
			
 
				+        # 尝试使用 hadoop classpath 查找
			
 
				+        returncode, stdout, stderr = run_command(f"{self.hadoop_cmd} classpath --glob")
			
 
				+        if returncode == 0:
			
 
				+            # 解析 classpath，查找 streaming jar
			
 
				+            classpath = stdout.strip()
			
 
				+            for part in classpath.split(os.pathsep):
			
 
				+                if 'streaming' in part.lower() and part.endswith('.jar'):
			
 
				+                    return part
			
 
				+        
			
 
				+        return None
			
 
				+    
			
 
				+    def count_words_locally(self, text: str) -> Dict[str, int]:
			
 
				+        """
			
 
				+        本地统计单词（不使用 Hadoop）
			
 
				+        
			
 
				+        用于测试和小规模数据处理。
			
 
				+        
			
 
				+        Args:
			
 
				+            text: 输入文本
			
 
				+            
			
 
				+        Returns:
			
 
				+            单词计数字典
			
 
				+            
			
 
				+        Example:
			
 
				+            >>> wc = WordCountStreaming()
			
 
				+            >>> wc.count_words_locally("hello world hello")
			
 
				+            {'hello': 2, 'world': 1}
			
 
				+        """
			
 
				+        # 模拟完整的 MapReduce 流程
			
 
				+        all_pairs = []
			
 
				+        for line in text.split('\n'):
			
 
				+            pairs = self.mapper(line)
			
 
				+            all_pairs.extend(pairs)
			
 
				+        
			
 
				+        # 按单词分组
			
 
				+        word_groups = defaultdict(list)
			
 
				+        for word, count in all_pairs:
			
 
				+            word_groups[word].append(count)
			
 
				+        
			
 
				+        # 执行 Reduce
			
 
				+        results = {}
			
 
				+        for word, counts in word_groups.items():
			
 
				+            _, total = self.reducer(word, counts)
			
 
				+            results[word] = total
			
 
				+        
			
 
				+        return results
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    """
			
 
				+    主函数：作为独立脚本运行
			
 
				+    
			
 
				+    支持的命令：
			
 
				+    - mapper: 运行 Mapper
			
 
				+    - reducer: 运行 Reducer
			
 
				+    - local: 本地测试
			
 
				+    """
			
 
				+    if len(sys.argv) < 2:
			
 
				+        print("Usage: python wordcount_streaming.py <command>")
			
 
				+        print("Commands:")
			
 
				+        print("  mapper   - Run Mapper from stdin")
			
 
				+        print("  reducer  - Run Reducer from stdin")
			
 
				+        print("  local    - Run local test")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    command = sys.argv[1]
			
 
				+    wc = WordCountStreaming()
			
 
				+    
			
 
				+    if command == 'mapper':
			
 
				+        wc.run_mapper_from_stdin()
			
 
				+    elif command == 'reducer':
			
 
				+        wc.run_reducer_from_stdin()
			
 
				+    elif command == 'local':
			
 
				+        # 本地测试
			
 
				+        test_text = """
			
 
				+        Hello world, hello Hadoop!
			
 
				+        Hadoop is great for big data.
			
 
				+        Big data processing with Hadoop.
			
 
				+        """
			
 
				+        result = wc.count_words_locally(test_text)
			
 
				+        print("Word count results:")
			
 
				+        for word, count in sorted(result.items(), key=lambda x: x[1], reverse=True):
			
 
				+            print(f"{word}: {count}")
			
 
				+    else:
			
 
				+        print(f"Unknown command: {command}")
			
 
				+        sys.exit(1)
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    main()
			
--- a/python/utils/__init__.py
+++ b/python/utils/__init__.py
@@ -0,0 +1,22 @@
 
				+"""
			
 
				+工具函数模块
			
 
				+
			
 
				+提供常用的辅助功能：
			
 
				+- 命令行执行工具
			
 
				+- 文件处理工具
			
 
				+- 日志工具
			
 
				+"""
			
 
				+
			
 
				+from .helpers import (
			
 
				+    run_command,
			
 
				+    validate_hdfs_path,
			
 
				+    format_file_size,
			
 
				+    setup_logger
			
 
				+)
			
 
				+
			
 
				+__all__ = [
			
 
				+    'run_command',
			
 
				+    'validate_hdfs_path',
			
 
				+    'format_file_size',
			
 
				+    'setup_logger'
			
 
				+]
			
--- a/python/utils/__pycache__/__init__.cpython-312.pyc
+++ b/python/utils/__pycache__/__init__.cpython-312.pyc
--- a/python/utils/__pycache__/helpers.cpython-312.pyc
+++ b/python/utils/__pycache__/helpers.cpython-312.pyc
--- a/python/utils/helpers.py
+++ b/python/utils/helpers.py
@@ -0,0 +1,127 @@
 
				+"""
			
 
				+工具函数模块
			
 
				+
			
 
				+提供常用的辅助功能：
			
 
				+- 命令行执行工具
			
 
				+- 文件处理工具
			
 
				+- 日志工具
			
 
				+"""
			
 
				+
			
 
				+import subprocess
			
 
				+import logging
			
 
				+import re
			
 
				+from typing import Tuple, Optional
			
 
				+
			
 
				+
			
 
				+def run_command(cmd: str, shell: bool = True, timeout: int = 300) -> Tuple[int, str, str]:
			
 
				+    """
			
 
				+    执行命令行命令
			
 
				+    
			
 
				+    Args:
			
 
				+        cmd: 要执行的命令
			
 
				+        shell: 是否使用 shell 执行
			
 
				+        timeout: 超时时间（秒）
			
 
				+        
			
 
				+    Returns:
			
 
				+        (return_code, stdout, stderr)
			
 
				+    """
			
 
				+    try:
			
 
				+        result = subprocess.run(
			
 
				+            cmd,
			
 
				+            shell=shell,
			
 
				+            capture_output=True,
			
 
				+            text=True,
			
 
				+            timeout=timeout
			
 
				+        )
			
 
				+        return result.returncode, result.stdout, result.stderr
			
 
				+    except subprocess.TimeoutExpired:
			
 
				+        return -1, "", f"Command timed out after {timeout} seconds"
			
 
				+    except Exception as e:
			
 
				+        return -1, "", str(e)
			
 
				+
			
 
				+
			
 
				+def validate_hdfs_path(path: str) -> bool:
			
 
				+    """
			
 
				+    验证 HDFS 路径格式是否有效
			
 
				+    
			
 
				+    Args:
			
 
				+        path: 要验证的路径
			
 
				+        
			
 
				+    Returns:
			
 
				+        路径是否有效
			
 
				+    """
			
 
				+    if not path:
			
 
				+        return False
			
 
				+    
			
 
				+    # HDFS 路径必须以 / 开头
			
 
				+    if not path.startswith('/'):
			
 
				+        return False
			
 
				+    
			
 
				+    # 检查是否包含非法字符
			
 
				+    invalid_chars = re.compile(r'[<>:"|?*]')
			
 
				+    if invalid_chars.search(path):
			
 
				+        return False
			
 
				+    
			
 
				+    # 检查是否包含连续的斜杠
			
 
				+    if '//' in path:
			
 
				+        return False
			
 
				+    
			
 
				+    return True
			
 
				+
			
 
				+
			
 
				+def format_file_size(size_bytes: int) -> str:
			
 
				+    """
			
 
				+    格式化文件大小，将字节转换为人类可读的格式
			
 
				+    
			
 
				+    Args:
			
 
				+        size_bytes: 文件大小（字节）
			
 
				+        
			
 
				+    Returns:
			
 
				+        格式化后的文件大小字符串
			
 
				+    """
			
 
				+    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
			
 
				+        if size_bytes < 1024.0:
			
 
				+            return f"{size_bytes:.2f} {unit}"
			
 
				+        size_bytes /= 1024.0
			
 
				+    return f"{size_bytes:.2f} PB"
			
 
				+
			
 
				+
			
 
				+def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
			
 
				+    """
			
 
				+    设置日志器
			
 
				+    
			
 
				+    Args:
			
 
				+        name: 日志器名称
			
 
				+        level: 日志级别
			
 
				+        log_file: 日志文件路径（可选）
			
 
				+        
			
 
				+    Returns:
			
 
				+        配置好的日志器
			
 
				+    """
			
 
				+    logger = logging.getLogger(name)
			
 
				+    logger.setLevel(level)
			
 
				+    
			
 
				+    # 避免重复添加处理器
			
 
				+    if logger.handlers:
			
 
				+        return logger
			
 
				+    
			
 
				+    # 创建格式器
			
 
				+    formatter = logging.Formatter(
			
 
				+        '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
			
 
				+        datefmt='%Y-%m-%d %H:%M:%S'
			
 
				+    )
			
 
				+    
			
 
				+    # 添加控制台处理器
			
 
				+    console_handler = logging.StreamHandler()
			
 
				+    console_handler.setLevel(level)
			
 
				+    console_handler.setFormatter(formatter)
			
 
				+    logger.addHandler(console_handler)
			
 
				+    
			
 
				+    # 如果指定了日志文件，添加文件处理器
			
 
				+    if log_file:
			
 
				+        file_handler = logging.FileHandler(log_file)
			
 
				+        file_handler.setLevel(level)
			
 
				+        file_handler.setFormatter(formatter)
			
 
				+        logger.addHandler(file_handler)
			
 
				+    
			
 
				+    return logger
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,11 @@
 
				+# Hadoop 相关依赖
			
 
				+# PySpark - 用于现代大数据处理
			
 
				+pyspark>=3.0.0
			
 
				+
			
 
				+# 可选：HDFS 客户端库（如果不想使用命令行工具）
			
 
				+# hdfs>=2.7.0
			
 
				+# pyhdfs>=0.3.0
			
 
				+
			
 
				+# 工具库
			
 
				+click>=7.0  # 用于创建命令行工具
			
 
				+rich>=10.0.0  # 用于美化输出