1 month ago · bd8c6c6740
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,4 @@ hs_err_pid*
 
															 /build/
														
 
															 /target/
														
 
															 /bin/
														
 
															+*.pyc
														
--- a/python/mapreduce/wordcount_streaming.py
+++ b/python/mapreduce/wordcount_streaming.py
@@ -1,48 +1,170 @@
 
															 """
														
 
															-Hadoop Streaming 方式的词频统计模块
														
 
															+Hadoop Streaming 方式的词频统计模块 - 现代化版本
														
 
															 对应 Java 版本的 WordCount 类，使用 Hadoop Streaming 方式实现：
														
 
															 - Mapper: 从标准输入读取数据，分割为单词，输出 <单词, 1>
														
 
															 - Reducer: 从标准输入读取 Mapper 输出，统计每个单词的总次数
														
 
															 - Combiner: 可选的本地合并，减少数据传输
														
 
															-使用方式：
														
 
															-1. 作为独立脚本运行（用于 Hadoop Streaming）：
														
 
															-   $ python wordcount_streaming.py mapper < input.txt
														
 
															-   $ python wordcount_streaming.py reducer < mapper_output.txt
														
 
															-
														
 
															-2. 作为模块导入使用：
														
 
															-   from wordcount_streaming import WordCountStreaming
														
 
															-   wc = WordCountStreaming()
														
 
															-   wc.run(input_path, output_path)
														
 
															+现代化特性：
														
 
															+- 配置管理集成
														
 
															+- 改进的错误处理
														
 
															+- 类型安全的数据类
														
 
															+- 增强的统计功能
														
 
															+- 灵活的命令行参数
														
 
															 """
														
 
															 import sys
														
 
															+import os
														
 
															+import json
														
 
															 from collections import defaultdict
														
 
															-from typing import Dict, List, Optional, Tuple
														
 
															-from ..utils.helpers import run_command, setup_logger
														
 
															+from dataclasses import dataclass, field, asdict
														
 
															+from typing import Dict, List, Optional, Tuple, Any, Iterator
														
 
															+from enum import Enum
														
 
															+
														
 
															+from ..utils.helpers import run_command, setup_logger, format_file_size, default_value
														
 
															+from ..config import ConfigurationManager, MapReduceConfig, get_config
														
 
															+
														
 
															+
														
 
															+class OutputFormat(Enum):
														
 
															+    """
														
 
															+    输出格式枚举
														
 
															+    """
														
 
															+    TEXT = 'text'
														
 
															+    JSON = 'json'
														
 
															+    CSV = 'csv'
														
 
															+
														
 
															+
														
 
															+@dataclass
														
 
															+class WordCountResult:
														
 
															+    """
														
 
															+    词频统计结果数据类
														
 
															+    """
														
 
															+    total_words: int = 0
														
 
															+    unique_words: int = 0
														
 
															+    top_words: List[Tuple[str, int]] = field(default_factory=list)
														
 
															+    word_counts: Dict[str, int] = field(default_factory=dict)
														
 
															+    execution_time_ms: float = 0.0
														
 
															+    
														
 
															+    @property
														
 
															+    def most_frequent_word(self) -> Optional[Tuple[str, int]]:
														
 
															+        """
														
 
															+        获取出现频率最高的单词
														
 
															+        
														
 
															+        Returns:
														
 
															+            (单词, 次数) 元组，如果没有单词返回 None
														
 
															+        """
														
 
															+        return self.top_words[0] if self.top_words else None
														
 
															+    
														
 
															+    @property
														
 
															+    def avg_word_frequency(self) -> float:
														
 
															+        """
														
 
															+        计算平均词频
														
 
															+        
														
 
															+        Returns:
														
 
															+            平均每个单词出现的次数
														
 
															+        """
														
 
															+        return self.total_words / self.unique_words if self.unique_words > 0 else 0.0
														
 
															+    
														
 
															+    def to_dict(self) -> Dict[str, Any]:
														
 
															+        """
														
 
															+        转换为字典
														
 
															+        
														
 
															+        Returns:
														
 
															+            字典表示
														
 
															+        """
														
 
															+        data = asdict(self)
														
 
															+        data['most_frequent_word'] = self.most_frequent_word
														
 
															+        data['avg_word_frequency'] = self.avg_word_frequency
														
 
															+        return data
														
 
															+    
														
 
															+    def to_json(self, indent: Optional[int] = None) -> str:
														
 
															+        """
														
 
															+        转换为 JSON 字符串
														
 
															+        
														
 
															+        Args:
														
 
															+            indent: 缩进空格数
														
 
															+            
														
 
															+        Returns:
														
 
															+            JSON 字符串
														
 
															+        """
														
 
															+        return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent)
														
 
															+    
														
 
															+    def save_to_file(self, file_path: str, format: OutputFormat = OutputFormat.JSON):
														
 
															+        """
														
 
															+        保存结果到文件
														
 
															+        
														
 
															+        Args:
														
 
															+            file_path: 文件路径
														
 
															+            format: 输出格式
														
 
															+        """
														
 
															+        os.makedirs(os.path.dirname(os.path.abspath(file_path)), exist_ok=True)
														
 
															+        
														
 
															+        if format == OutputFormat.JSON:
														
 
															+            with open(file_path, 'w', encoding='utf-8') as f:
														
 
															+                json.dump(self.to_dict(), f, ensure_ascii=False, indent=2)
														
 
															+        elif format == OutputFormat.CSV:
														
 
															+            with open(file_path, 'w', encoding='utf-8') as f:
														
 
															+                f.write('word,count\n')
														
 
															+                for word, count in sorted(self.word_counts.items(), key=lambda x: x[1], reverse=True):
														
 
															+                    f.write(f'"{word}",{count}\n')
														
 
															+        else:
														
 
															+            with open(file_path, 'w', encoding='utf-8') as f:
														
 
															+                f.write(f"Total words: {self.total_words}\n")
														
 
															+                f.write(f"Unique words: {self.unique_words}\n")
														
 
															+                f.write(f"Most frequent: {self.most_frequent_word}\n")
														
 
															+                f.write(f"\nTop 20 words:\n")
														
 
															+                for word, count in self.top_words[:20]:
														
 
															+                    f.write(f"{word}: {count}\n")
														
 
															 class WordCountStreaming:
														
 
															     """
														
 
															-    Hadoop Streaming 方式的词频统计类
														
 
															+    Hadoop Streaming 方式的词频统计类 - 现代化版本
														
 
															     封装了 Hadoop Streaming 作业的执行，提供与 Java 版本 WordCount 类类似的功能。
														
 
															+    
														
 
															+    现代化特性：
														
 
															+    - 配置管理集成
														
 
															+    - 改进的错误处理
														
 
															+    - 类型安全的数据类
														
 
															+    - 增强的统计功能
														
 
															+    - 灵活的命令行参数
														
 
															     """
														
 
															-    def __init__(self, hadoop_home: Optional[str] = None, logger_name: str = 'wordcount_streaming'):
														
 
															+    def __init__(self, 
														
 
															+                 hadoop_home: Optional[str] = None, 
														
 
															+                 logger_name: str = 'wordcount_streaming',
														
 
															+                 config: Optional[MapReduceConfig] = None):
														
 
															         """
														
 
															         初始化 WordCountStreaming 实例
														
 
															         Args:
														
 
															             hadoop_home: Hadoop 安装目录（可选，默认从环境变量获取）
														
 
															             logger_name: 日志器名称
														
 
															+            config: MapReduce 配置对象（可选）
														
 
															         """
														
 
															         self.logger = setup_logger(logger_name)
														
 
															-        self.hadoop_home = hadoop_home or __import__('os').environ.get('HADOOP_HOME', '')
														
 
															+        
														
 
															+        # 获取配置
														
 
															+        if config is None:
														
 
															+            config = get_config().mapreduce
														
 
															+        
														
 
															+        self.config = config
														
 
															+        
														
 
															+        # Hadoop 配置
														
 
															+        self.hadoop_home = hadoop_home or os.environ.get('HADOOP_HOME', '')
														
 
															         self.hadoop_cmd = 'hadoop'
														
 
															-    def mapper(self, line: str) -> List[Tuple[str, int]]:
														
 
															+        # 从配置中获取设置
														
 
															+        self.use_combiner = config.use_combiner
														
 
															+        self.num_reducers = config.num_reducers
														
 
															+        self.job_timeout = config.job_timeout
														
 
															+    
														
 
															+    def mapper(self, line: str, 
														
 
															+               case_sensitive: bool = False,
														
 
															+               min_word_length: int = 1,
														
 
															+               stop_words: Optional[List[str]] = None) -> List[Tuple[str, int]]:
														
 
															         """
														
 
															         Mapper 函数：将一行文本分割为单词，输出 <单词, 1>
														
@@ -50,6 +172,9 @@ class WordCountStreaming:
 
															         Args:
														
 
															             line: 输入的一行文本
														
 
															+            case_sensitive: 是否区分大小写（默认 False）
														
 
															+            min_word_length: 最小单词长度（默认 1）
														
 
															+            stop_words: 停用词列表（可选）
														
 
															         Returns:
														
 
															             单词和计数的元组列表
														
@@ -60,13 +185,29 @@ class WordCountStreaming:
 
															             [('hello', 1), ('world', 1), ('hello', 1)]
														
 
															         """
														
 
															         results = []
														
 
															+        
														
 
															         # 分割文本为单词（使用空格、制表符等分隔符）
														
 
															         words = line.strip().split()
														
 
															+        
														
 
															         for word in words:
														
 
															-            # 清理单词（移除标点符号，转为小写）
														
 
															-            word = word.strip('.,!?;:()[]{}"\'').lower()
														
 
															-            if word:  # 确保单词非空
														
 
															+            # 清理单词（移除标点符号）
														
 
															+            word = word.strip('.,!?;:()[]{}"\'')
														
 
															+            
														
 
															+            # 处理大小写
														
 
															+            if not case_sensitive:
														
 
															+                word = word.lower()
														
 
															+            
														
 
															+            # 检查单词长度
														
 
															+            if len(word) < min_word_length:
														
 
															+                continue
														
 
															+            
														
 
															+            # 检查停用词
														
 
															+            if stop_words and word in stop_words:
														
 
															+                continue
														
 
															+            
														
 
															+            if word:
														
 
															                 results.append((word, 1))
														
 
															+        
														
 
															         return results
														
 
															     def combiner(self, pairs: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
														
@@ -112,14 +253,22 @@ class WordCountStreaming:
 
															         total = sum(counts)
														
 
															         return (word, total)
														
 
															-    def run_mapper_from_stdin(self):
														
 
															+    def run_mapper_from_stdin(self, 
														
 
															+                               case_sensitive: bool = False,
														
 
															+                               min_word_length: int = 1,
														
 
															+                               stop_words: Optional[List[str]] = None):
														
 
															         """
														
 
															         从标准输入运行 Mapper（用于 Hadoop Streaming）
														
 
															         从 stdin 读取每行数据，执行 Mapper 逻辑，输出到 stdout。
														
 
															+        
														
 
															+        Args:
														
 
															+            case_sensitive: 是否区分大小写
														
 
															+            min_word_length: 最小单词长度
														
 
															+            stop_words: 停用词列表
														
 
															         """
														
 
															         for line in sys.stdin:
														
 
															-            pairs = self.mapper(line)
														
 
															+            pairs = self.mapper(line, case_sensitive, min_word_length, stop_words)
														
 
															             for word, count in pairs:
														
 
															                 print(f"{word}\t{count}")
														
@@ -170,8 +319,9 @@ class WordCountStreaming:
 
															     def run(self, input_path: str, output_path: str, 
														
 
															             mapper_script: Optional[str] = None,
														
 
															             reducer_script: Optional[str] = None,
														
 
															-            combiner: bool = True,
														
 
															-            num_reducers: int = 1) -> bool:
														
 
															+            combiner: Optional[bool] = None,
														
 
															+            num_reducers: Optional[int] = None,
														
 
															+            timeout: Optional[int] = None) -> bool:
														
 
															         """
														
 
															         运行完整的 WordCount 作业
														
@@ -182,13 +332,21 @@ class WordCountStreaming:
 
															             output_path: HDFS 输出路径（不能已存在）
														
 
															             mapper_script: Mapper 脚本路径（可选，默认使用当前脚本）
														
 
															             reducer_script: Reducer 脚本路径（可选，默认使用当前脚本）
														
 
															-            combiner: 是否使用 Combiner
														
 
															-            num_reducers: Reducer 任务数量
														
 
															+            combiner: 是否使用 Combiner（可选，默认为配置中的设置）
														
 
															+            num_reducers: Reducer 任务数量（可选，默认为配置中的设置）
														
 
															+            timeout: 超时时间（秒，可选，默认为配置中的设置）
														
 
															         Returns:
														
 
															             作业是否成功完成
														
 
															+            
														
 
															+        Raises:
														
 
															+            RuntimeError: 当找不到 Streaming jar 时
														
 
															+            ValueError: 当输入路径无效时
														
 
															         """
														
 
															-        import os
														
 
															+        # 使用配置中的默认值
														
 
															+        use_combiner = default_value(combiner, self.use_combiner)
														
 
															+        reducers = default_value(num_reducers, self.num_reducers)
														
 
															+        job_timeout = default_value(timeout, self.job_timeout)
														
 
															         # 确定脚本路径
														
 
															         if mapper_script is None:
														
@@ -200,7 +358,7 @@ class WordCountStreaming:
 
															         streaming_jar = self._find_streaming_jar()
														
 
															         if not streaming_jar:
														
 
															             self.logger.error("Could not find Hadoop Streaming jar")
														
 
															-            return False
														
 
															+            raise RuntimeError("Hadoop Streaming jar not found")
														
 
															         cmd_parts = [
														
 
															             self.hadoop_cmd,
														
@@ -210,16 +368,16 @@ class WordCountStreaming:
 
															             '-reducer', f"python3 {os.path.basename(reducer_script)} reducer",
														
 
															             '-input', input_path,
														
 
															             '-output', output_path,
														
 
															-            '-D', f"mapreduce.job.reduces={num_reducers}"
														
 
															+            '-D', f"mapreduce.job.reduces={reducers}"
														
 
															         ]
														
 
															-        if combiner:
														
 
															+        if use_combiner:
														
 
															             cmd_parts.extend(['-combiner', f"python3 {os.path.basename(mapper_script)} mapper | sort | python3 {os.path.basename(reducer_script)} reducer"])
														
 
															         cmd = ' '.join(cmd_parts)
														
 
															         self.logger.info(f"Running Hadoop Streaming job: {cmd}")
														
 
															-        returncode, stdout, stderr = run_command(cmd, timeout=3600)  # 1小时超时
														
 
															+        returncode, stdout, stderr = run_command(cmd, timeout=job_timeout)
														
 
															         if returncode == 0:
														
 
															             self.logger.info("WordCount job completed successfully")
														
@@ -237,7 +395,6 @@ class WordCountStreaming:
 
															         Returns:
														
 
															             Streaming jar 文件路径，如果未找到返回 None
														
 
															         """
														
 
															-        import os
														
 
															         import glob
														
 
															         # 尝试从常见位置查找
														
@@ -265,7 +422,11 @@ class WordCountStreaming:
 
															         return None
														
 
															-    def count_words_locally(self, text: str) -> Dict[str, int]:
														
 
															+    def count_words_locally(self, text: str,
														
 
															+                             case_sensitive: bool = False,
														
 
															+                             min_word_length: int = 1,
														
 
															+                             stop_words: Optional[List[str]] = None,
														
 
															+                             top_n: int = 10) -> WordCountResult:
														
 
															         """
														
 
															         本地统计单词（不使用 Hadoop）
														
@@ -273,19 +434,29 @@ class WordCountStreaming:
 
															         Args:
														
 
															             text: 输入文本
														
 
															+            case_sensitive: 是否区分大小写
														
 
															+            min_word_length: 最小单词长度
														
 
															+            stop_words: 停用词列表
														
 
															+            top_n: 返回前 N 个最常见的单词
														
 
															         Returns:
														
 
															-            单词计数字典
														
 
															+            WordCountResult 结果对象
														
 
															         Example:
														
 
															             >>> wc = WordCountStreaming()
														
 
															-            >>> wc.count_words_locally("hello world hello")
														
 
															-            {'hello': 2, 'world': 1}
														
 
															+            >>> result = wc.count_words_locally("hello world hello")
														
 
															+            >>> result.total_words
														
 
															+            3
														
 
															+            >>> result.unique_words
														
 
															+            2
														
 
															         """
														
 
															+        import time
														
 
															+        start_time = time.time()
														
 
															+        
														
 
															         # 模拟完整的 MapReduce 流程
														
 
															         all_pairs = []
														
 
															         for line in text.split('\n'):
														
 
															-            pairs = self.mapper(line)
														
 
															+            pairs = self.mapper(line, case_sensitive, min_word_length, stop_words)
														
 
															             all_pairs.extend(pairs)
														
 
															         # 按单词分组
														
@@ -294,12 +465,109 @@ class WordCountStreaming:
 
															             word_groups[word].append(count)
														
 
															         # 执行 Reduce
														
 
															-        results = {}
														
 
															+        word_counts = {}
														
 
															         for word, counts in word_groups.items():
														
 
															             _, total = self.reducer(word, counts)
														
 
															-            results[word] = total
														
 
															+            word_counts[word] = total
														
 
															-        return results
														
 
															+        # 计算统计信息
														
 
															+        total_words = sum(word_counts.values())
														
 
															+        unique_words = len(word_counts)
														
 
															+        top_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:top_n]
														
 
															+        
														
 
															+        # 计算执行时间
														
 
															+        execution_time_ms = (time.time() - start_time) * 1000
														
 
															+        
														
 
															+        return WordCountResult(
														
 
															+            total_words=total_words,
														
 
															+            unique_words=unique_words,
														
 
															+            top_words=top_words,
														
 
															+            word_counts=word_counts,
														
 
															+            execution_time_ms=execution_time_ms
														
 
															+        )
														
 
															+    
														
 
															+    def analyze_text(self, text: str) -> Dict[str, Any]:
														
 
															+        """
														
 
															+        分析文本的详细统计信息
														
 
															+        
														
 
															+        Args:
														
 
															+            text: 输入文本
														
 
															+            
														
 
															+        Returns:
														
 
															+            包含详细统计信息的字典
														
 
															+        """
														
 
															+        result = self.count_words_locally(text)
														
 
															+        
														
 
															+        # 计算额外的统计信息
														
 
															+        word_lengths = [len(word) for word in result.word_counts.keys()]
														
 
															+        
														
 
															+        analysis = {
														
 
															+            'total_words': result.total_words,
														
 
															+            'unique_words': result.unique_words,
														
 
															+            'most_frequent_word': result.most_frequent_word,
														
 
															+            'avg_word_frequency': result.avg_word_frequency,
														
 
															+            'min_word_length': min(word_lengths) if word_lengths else 0,
														
 
															+            'max_word_length': max(word_lengths) if word_lengths else 0,
														
 
															+            'avg_word_length': sum(word_lengths) / len(word_lengths) if word_lengths else 0,
														
 
															+            'word_frequency_distribution': self._get_frequency_distribution(result.word_counts),
														
 
															+            'top_10_words': result.top_words[:10]
														
 
															+        }
														
 
															+        
														
 
															+        return analysis
														
 
															+    
														
 
															+    def _get_frequency_distribution(self, word_counts: Dict[str, int]) -> Dict[str, int]:
														
 
															+        """
														
 
															+        获取词频分布
														
 
															+        
														
 
															+        Args:
														
 
															+            word_counts: 单词计数字典
														
 
															+            
														
 
															+        Returns:
														
 
															+            词频分布（出现1次的单词数、出现2次的单词数等）
														
 
															+        """
														
 
															+        distribution = defaultdict(int)
														
 
															+        for count in word_counts.values():
														
 
															+            if count == 1:
														
 
															+                distribution['once'] += 1
														
 
															+            elif count <= 5:
														
 
															+                distribution['2-5'] += 1
														
 
															+            elif count <= 10:
														
 
															+                distribution['6-10'] += 1
														
 
															+            else:
														
 
															+                distribution['10+'] += 1
														
 
															+        return dict(distribution)
														
 
															+    
														
 
															+    def get_stop_words(self, language: str = 'english') -> List[str]:
														
 
															+        """
														
 
															+        获取常见停用词列表
														
 
															+        
														
 
															+        Args:
														
 
															+            language: 语言（默认 'english'）
														
 
															+            
														
 
															+        Returns:
														
 
															+            停用词列表
														
 
															+        """
														
 
															+        # 常见英语停用词
														
 
															+        english_stop_words = [
														
 
															+            'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i',
														
 
															+            'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do',
														
 
															+            'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say',
														
 
															+            'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would',
														
 
															+            'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about',
														
 
															+            'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can',
														
 
															+            'like', 'time', 'no', 'just', 'him', 'know', 'take', 'people',
														
 
															+            'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see',
														
 
															+            'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its',
														
 
															+            'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how',
														
 
															+            'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want',
														
 
															+            'because', 'any', 'these', 'give', 'day', 'most', 'us'
														
 
															+        ]
														
 
															+        
														
 
															+        if language.lower() == 'english':
														
 
															+            return english_stop_words
														
 
															+        else:
														
 
															+            # 默认返回英语停用词
														
 
															+            return english_stop_words
														
 
															 def main():
														
@@ -310,36 +578,189 @@ def main():
 
															     - mapper: 运行 Mapper
														
 
															     - reducer: 运行 Reducer
														
 
															     - local: 本地测试
														
 
															+    - analyze: 分析文本
														
 
															     """
														
 
															-    if len(sys.argv) < 2:
														
 
															-        print("Usage: python wordcount_streaming.py <command>")
														
 
															-        print("Commands:")
														
 
															-        print("  mapper   - Run Mapper from stdin")
														
 
															-        print("  reducer  - Run Reducer from stdin")
														
 
															-        print("  local    - Run local test")
														
 
															+    import argparse
														
 
															+    
														
 
															+    parser = argparse.ArgumentParser(
														
 
															+        description='Hadoop Streaming WordCount (Modern Version)',
														
 
															+        formatter_class=argparse.RawDescriptionHelpFormatter,
														
 
															+        epilog='''
														
 
															+Examples:
														
 
															+  # 从标准输入运行 Mapper
														
 
															+  python wordcount_streaming.py mapper < input.txt
														
 
															+  
														
 
															+  # 从标准输入运行 Reducer
														
 
															+  python wordcount_streaming.py reducer < mapper_output.txt
														
 
															+  
														
 
															+  # 本地测试
														
 
															+  python wordcount_streaming.py local
														
 
															+  
														
 
															+  # 分析文本文件
														
 
															+  python wordcount_streaming.py analyze --file input.txt --stop-words
														
 
															+  
														
 
															+  # 使用增强的 Mapper 参数
														
 
															+  python wordcount_streaming.py mapper --case-sensitive --min-length 3 < input.txt
														
 
															+        '''
														
 
															+    )
														
 
															+    
														
 
															+    # 子命令
														
 
															+    subparsers = parser.add_subparsers(dest='command', help='Available commands')
														
 
															+    
														
 
															+    # mapper 子命令
														
 
															+    mapper_parser = subparsers.add_parser('mapper', help='Run Mapper from stdin')
														
 
															+    mapper_parser.add_argument('--case-sensitive', '-c', action='store_true',
														
 
															+                               help='Case-sensitive word matching')
														
 
															+    mapper_parser.add_argument('--min-length', '-m', type=int, default=1,
														
 
															+                               help='Minimum word length (default: 1)')
														
 
															+    mapper_parser.add_argument('--stop-words', '-s', action='store_true',
														
 
															+                               help='Use common English stop words')
														
 
															+    mapper_parser.add_argument('--stop-words-file', type=str, default=None,
														
 
															+                               help='Custom stop words file path')
														
 
															+    
														
 
															+    # reducer 子命令
														
 
															+    reducer_parser = subparsers.add_parser('reducer', help='Run Reducer from stdin')
														
 
															+    
														
 
															+    # local 子命令
														
 
															+    local_parser = subparsers.add_parser('local', help='Run local test')
														
 
															+    local_parser.add_argument('--file', '-f', type=str, default=None,
														
 
															+                              help='Input file path (default: use sample text)')
														
 
															+    local_parser.add_argument('--case-sensitive', '-c', action='store_true',
														
 
															+                               help='Case-sensitive word matching')
														
 
															+    local_parser.add_argument('--min-length', '-m', type=int, default=1,
														
 
															+                               help='Minimum word length (default: 1)')
														
 
															+    local_parser.add_argument('--stop-words', '-s', action='store_true',
														
 
															+                               help='Use common English stop words')
														
 
															+    local_parser.add_argument('--top-n', '-n', type=int, default=10,
														
 
															+                               help='Number of top words to show (default: 10)')
														
 
															+    local_parser.add_argument('--format', '-fmt', type=str, default='text',
														
 
															+                               choices=['text', 'json', 'csv'],
														
 
															+                               help='Output format (default: text)')
														
 
															+    local_parser.add_argument('--output', '-o', type=str, default=None,
														
 
															+                               help='Output file path (optional)')
														
 
															+    
														
 
															+    # analyze 子命令
														
 
															+    analyze_parser = subparsers.add_parser('analyze', help='Analyze text statistics')
														
 
															+    analyze_parser.add_argument('--file', '-f', type=str, required=True,
														
 
															+                                help='Input file path')
														
 
															+    analyze_parser.add_argument('--format', '-fmt', type=str, default='json',
														
 
															+                               choices=['text', 'json'],
														
 
															+                               help='Output format (default: json)')
														
 
															+    
														
 
															+    args = parser.parse_args()
														
 
															+    
														
 
															+    if not args.command:
														
 
															+        parser.print_help()
														
 
															         sys.exit(1)
														
 
															-    command = sys.argv[1]
														
 
															     wc = WordCountStreaming()
														
 
															-    if command == 'mapper':
														
 
															-        wc.run_mapper_from_stdin()
														
 
															-    elif command == 'reducer':
														
 
															+    # 处理停用词
														
 
															+    stop_words = None
														
 
															+    if hasattr(args, 'stop_words') and args.stop_words:
														
 
															+        stop_words = wc.get_stop_words()
														
 
															+    
														
 
															+    if hasattr(args, 'stop_words_file') and args.stop_words_file:
														
 
															+        if os.path.exists(args.stop_words_file):
														
 
															+            with open(args.stop_words_file, 'r', encoding='utf-8') as f:
														
 
															+                custom_stop_words = [line.strip().lower() for line in f if line.strip()]
														
 
															+            if stop_words is None:
														
 
															+                stop_words = custom_stop_words
														
 
															+            else:
														
 
															+                stop_words.extend(custom_stop_words)
														
 
															+    
														
 
															+    if args.command == 'mapper':
														
 
															+        wc.run_mapper_from_stdin(
														
 
															+            case_sensitive=getattr(args, 'case_sensitive', False),
														
 
															+            min_word_length=getattr(args, 'min_length', 1),
														
 
															+            stop_words=stop_words
														
 
															+        )
														
 
															+    
														
 
															+    elif args.command == 'reducer':
														
 
															         wc.run_reducer_from_stdin()
														
 
															-    elif command == 'local':
														
 
															+    
														
 
															+    elif args.command == 'local':
														
 
															         # 本地测试
														
 
															-        test_text = """
														
 
															-        Hello world, hello Hadoop!
														
 
															-        Hadoop is great for big data.
														
 
															-        Big data processing with Hadoop.
														
 
															-        """
														
 
															-        result = wc.count_words_locally(test_text)
														
 
															-        print("Word count results:")
														
 
															-        for word, count in sorted(result.items(), key=lambda x: x[1], reverse=True):
														
 
															-            print(f"{word}: {count}")
														
 
															-    else:
														
 
															-        print(f"Unknown command: {command}")
														
 
															-        sys.exit(1)
														
 
															+        if args.file and os.path.exists(args.file):
														
 
															+            with open(args.file, 'r', encoding='utf-8') as f:
														
 
															+                test_text = f.read()
														
 
															+        else:
														
 
															+            # 使用示例文本
														
 
															+            test_text = """
														
 
															+            Hello world, hello Hadoop!
														
 
															+            Hadoop is great for big data.
														
 
															+            Big data processing with Hadoop.
														
 
															+            Spark is fast and general engine for large-scale data processing.
														
 
															+            Hadoop provides massive storage for any kind of data.
														
 
															+            """
														
 
															+        
														
 
															+        # 执行统计
														
 
															+        result = wc.count_words_locally(
														
 
															+            test_text,
														
 
															+            case_sensitive=args.case_sensitive,
														
 
															+            min_word_length=args.min_length,
														
 
															+            stop_words=stop_words,
														
 
															+            top_n=args.top_n
														
 
															+        )
														
 
															+        
														
 
															+        # 确定输出格式
														
 
															+        output_format = OutputFormat(args.format)
														
 
															+        
														
 
															+        # 输出结果
														
 
															+        if args.output:
														
 
															+            result.save_to_file(args.output, output_format)
														
 
															+            print(f"Result saved to: {args.output}")
														
 
															+        else:
														
 
															+            if output_format == OutputFormat.JSON:
														
 
															+                print(result.to_json(indent=2))
														
 
															+            elif output_format == OutputFormat.CSV:
														
 
															+                print('word,count')
														
 
															+                for word, count in sorted(result.word_counts.items(), key=lambda x: x[1], reverse=True):
														
 
															+                    print(f'"{word}",{count}')
														
 
															+            else:
														
 
															+                print(f"\n{'='*60}")
														
 
															+                print(f"Word Count Results (Local Mode)")
														
 
															+                print(f"{'='*60}")
														
 
															+                print(f"\nTotal words: {result.total_words:,}")
														
 
															+                print(f"Unique words: {result.unique_words:,}")
														
 
															+                print(f"Execution time: {result.execution_time_ms:.2f} ms")
														
 
															+                
														
 
															+                if result.most_frequent_word:
														
 
															+                    print(f"\nMost frequent word: {result.most_frequent_word[0]} ({result.most_frequent_word[1]} times)")
														
 
															+                
														
 
															+                print(f"\nTop {args.top_n} words:")
														
 
															+                for word, count in result.top_words:
														
 
															+                    print(f"  {word}: {count}")
														
 
															+    
														
 
															+    elif args.command == 'analyze':
														
 
															+        # 分析文本
														
 
															+        with open(args.file, 'r', encoding='utf-8') as f:
														
 
															+            text = f.read()
														
 
															+        
														
 
															+        analysis = wc.analyze_text(text)
														
 
															+        
														
 
															+        if args.format == 'json':
														
 
															+            print(json.dumps(analysis, ensure_ascii=False, indent=2))
														
 
															+        else:
														
 
															+            print(f"\n{'='*60}")
														
 
															+            print(f"Text Analysis Report")
														
 
															+            print(f"{'='*60}")
														
 
															+            print(f"\nFile: {args.file}")
														
 
															+            print(f"\nBasic Statistics:")
														
 
															+            print(f"  Total words: {analysis['total_words']:,}")
														
 
															+            print(f"  Unique words: {analysis['unique_words']:,}")
														
 
															+            print(f"  Most frequent: {analysis['most_frequent_word']}")
														
 
															+            print(f"  Average frequency: {analysis['avg_word_frequency']:.2f}")
														
 
															+            print(f"\nWord Length Statistics:")
														
 
															+            print(f"  Min word length: {analysis['min_word_length']}")
														
 
															+            print(f"  Max word length: {analysis['max_word_length']}")
														
 
															+            print(f"  Average word length: {analysis['avg_word_length']:.2f}")
														
 
															+            print(f"\nWord Frequency Distribution:")
														
 
															+            for key, value in analysis['word_frequency_distribution'].items():
														
 
															+                print(f"  {key}: {value}")
														
 
															+            print(f"\nTop 10 Words:")
														
 
															+            for word, count in analysis['top_10_words']:
														
 
															+                print(f"  {word}: {count}")
														
 
															 if __name__ == '__main__':