|
|
@@ -1,48 +1,170 @@
|
|
|
"""
|
|
|
-Hadoop Streaming 方式的词频统计模块
|
|
|
+Hadoop Streaming 方式的词频统计模块 - 现代化版本
|
|
|
|
|
|
对应 Java 版本的 WordCount 类,使用 Hadoop Streaming 方式实现:
|
|
|
- Mapper: 从标准输入读取数据,分割为单词,输出 <单词, 1>
|
|
|
- Reducer: 从标准输入读取 Mapper 输出,统计每个单词的总次数
|
|
|
- Combiner: 可选的本地合并,减少数据传输
|
|
|
|
|
|
-使用方式:
|
|
|
-1. 作为独立脚本运行(用于 Hadoop Streaming):
|
|
|
- $ python wordcount_streaming.py mapper < input.txt
|
|
|
- $ python wordcount_streaming.py reducer < mapper_output.txt
|
|
|
-
|
|
|
-2. 作为模块导入使用:
|
|
|
- from wordcount_streaming import WordCountStreaming
|
|
|
- wc = WordCountStreaming()
|
|
|
- wc.run(input_path, output_path)
|
|
|
+现代化特性:
|
|
|
+- 配置管理集成
|
|
|
+- 改进的错误处理
|
|
|
+- 类型安全的数据类
|
|
|
+- 增强的统计功能
|
|
|
+- 灵活的命令行参数
|
|
|
"""
|
|
|
|
|
|
import sys
|
|
|
+import os
|
|
|
+import json
|
|
|
from collections import defaultdict
|
|
|
-from typing import Dict, List, Optional, Tuple
|
|
|
-from ..utils.helpers import run_command, setup_logger
|
|
|
+from dataclasses import dataclass, field, asdict
|
|
|
+from typing import Dict, List, Optional, Tuple, Any, Iterator
|
|
|
+from enum import Enum
|
|
|
+
|
|
|
+from ..utils.helpers import run_command, setup_logger, format_file_size, default_value
|
|
|
+from ..config import ConfigurationManager, MapReduceConfig, get_config
|
|
|
+
|
|
|
+
|
|
|
+class OutputFormat(Enum):
|
|
|
+ """
|
|
|
+ 输出格式枚举
|
|
|
+ """
|
|
|
+ TEXT = 'text'
|
|
|
+ JSON = 'json'
|
|
|
+ CSV = 'csv'
|
|
|
+
|
|
|
+
|
|
|
+@dataclass
|
|
|
+class WordCountResult:
|
|
|
+ """
|
|
|
+ 词频统计结果数据类
|
|
|
+ """
|
|
|
+ total_words: int = 0
|
|
|
+ unique_words: int = 0
|
|
|
+ top_words: List[Tuple[str, int]] = field(default_factory=list)
|
|
|
+ word_counts: Dict[str, int] = field(default_factory=dict)
|
|
|
+ execution_time_ms: float = 0.0
|
|
|
+
|
|
|
+ @property
|
|
|
+ def most_frequent_word(self) -> Optional[Tuple[str, int]]:
|
|
|
+ """
|
|
|
+ 获取出现频率最高的单词
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ (单词, 次数) 元组,如果没有单词返回 None
|
|
|
+ """
|
|
|
+ return self.top_words[0] if self.top_words else None
|
|
|
+
|
|
|
+ @property
|
|
|
+ def avg_word_frequency(self) -> float:
|
|
|
+ """
|
|
|
+ 计算平均词频
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 平均每个单词出现的次数
|
|
|
+ """
|
|
|
+ return self.total_words / self.unique_words if self.unique_words > 0 else 0.0
|
|
|
+
|
|
|
+ def to_dict(self) -> Dict[str, Any]:
|
|
|
+ """
|
|
|
+ 转换为字典
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 字典表示
|
|
|
+ """
|
|
|
+ data = asdict(self)
|
|
|
+ data['most_frequent_word'] = self.most_frequent_word
|
|
|
+ data['avg_word_frequency'] = self.avg_word_frequency
|
|
|
+ return data
|
|
|
+
|
|
|
+ def to_json(self, indent: Optional[int] = None) -> str:
|
|
|
+ """
|
|
|
+ 转换为 JSON 字符串
|
|
|
+
|
|
|
+ Args:
|
|
|
+ indent: 缩进空格数
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ JSON 字符串
|
|
|
+ """
|
|
|
+ return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent)
|
|
|
+
|
|
|
+ def save_to_file(self, file_path: str, format: OutputFormat = OutputFormat.JSON):
|
|
|
+ """
|
|
|
+ 保存结果到文件
|
|
|
+
|
|
|
+ Args:
|
|
|
+ file_path: 文件路径
|
|
|
+ format: 输出格式
|
|
|
+ """
|
|
|
+ os.makedirs(os.path.dirname(os.path.abspath(file_path)), exist_ok=True)
|
|
|
+
|
|
|
+ if format == OutputFormat.JSON:
|
|
|
+ with open(file_path, 'w', encoding='utf-8') as f:
|
|
|
+ json.dump(self.to_dict(), f, ensure_ascii=False, indent=2)
|
|
|
+ elif format == OutputFormat.CSV:
|
|
|
+ with open(file_path, 'w', encoding='utf-8') as f:
|
|
|
+ f.write('word,count\n')
|
|
|
+ for word, count in sorted(self.word_counts.items(), key=lambda x: x[1], reverse=True):
|
|
|
+ f.write(f'"{word}",{count}\n')
|
|
|
+ else:
|
|
|
+ with open(file_path, 'w', encoding='utf-8') as f:
|
|
|
+ f.write(f"Total words: {self.total_words}\n")
|
|
|
+ f.write(f"Unique words: {self.unique_words}\n")
|
|
|
+ f.write(f"Most frequent: {self.most_frequent_word}\n")
|
|
|
+ f.write(f"\nTop 20 words:\n")
|
|
|
+ for word, count in self.top_words[:20]:
|
|
|
+ f.write(f"{word}: {count}\n")
|
|
|
|
|
|
|
|
|
class WordCountStreaming:
|
|
|
"""
|
|
|
- Hadoop Streaming 方式的词频统计类
|
|
|
+ Hadoop Streaming 方式的词频统计类 - 现代化版本
|
|
|
|
|
|
封装了 Hadoop Streaming 作业的执行,提供与 Java 版本 WordCount 类类似的功能。
|
|
|
+
|
|
|
+ 现代化特性:
|
|
|
+ - 配置管理集成
|
|
|
+ - 改进的错误处理
|
|
|
+ - 类型安全的数据类
|
|
|
+ - 增强的统计功能
|
|
|
+ - 灵活的命令行参数
|
|
|
"""
|
|
|
|
|
|
- def __init__(self, hadoop_home: Optional[str] = None, logger_name: str = 'wordcount_streaming'):
|
|
|
+ def __init__(self,
|
|
|
+ hadoop_home: Optional[str] = None,
|
|
|
+ logger_name: str = 'wordcount_streaming',
|
|
|
+ config: Optional[MapReduceConfig] = None):
|
|
|
"""
|
|
|
初始化 WordCountStreaming 实例
|
|
|
|
|
|
Args:
|
|
|
hadoop_home: Hadoop 安装目录(可选,默认从环境变量获取)
|
|
|
logger_name: 日志器名称
|
|
|
+ config: MapReduce 配置对象(可选)
|
|
|
"""
|
|
|
self.logger = setup_logger(logger_name)
|
|
|
- self.hadoop_home = hadoop_home or __import__('os').environ.get('HADOOP_HOME', '')
|
|
|
+
|
|
|
+ # 获取配置
|
|
|
+ if config is None:
|
|
|
+ config = get_config().mapreduce
|
|
|
+
|
|
|
+ self.config = config
|
|
|
+
|
|
|
+ # Hadoop 配置
|
|
|
+ self.hadoop_home = hadoop_home or os.environ.get('HADOOP_HOME', '')
|
|
|
self.hadoop_cmd = 'hadoop'
|
|
|
|
|
|
- def mapper(self, line: str) -> List[Tuple[str, int]]:
|
|
|
+ # 从配置中获取设置
|
|
|
+ self.use_combiner = config.use_combiner
|
|
|
+ self.num_reducers = config.num_reducers
|
|
|
+ self.job_timeout = config.job_timeout
|
|
|
+
|
|
|
+ def mapper(self, line: str,
|
|
|
+ case_sensitive: bool = False,
|
|
|
+ min_word_length: int = 1,
|
|
|
+ stop_words: Optional[List[str]] = None) -> List[Tuple[str, int]]:
|
|
|
"""
|
|
|
Mapper 函数:将一行文本分割为单词,输出 <单词, 1>
|
|
|
|
|
|
@@ -50,6 +172,9 @@ class WordCountStreaming:
|
|
|
|
|
|
Args:
|
|
|
line: 输入的一行文本
|
|
|
+ case_sensitive: 是否区分大小写(默认 False)
|
|
|
+ min_word_length: 最小单词长度(默认 1)
|
|
|
+ stop_words: 停用词列表(可选)
|
|
|
|
|
|
Returns:
|
|
|
单词和计数的元组列表
|
|
|
@@ -60,13 +185,29 @@ class WordCountStreaming:
|
|
|
[('hello', 1), ('world', 1), ('hello', 1)]
|
|
|
"""
|
|
|
results = []
|
|
|
+
|
|
|
# 分割文本为单词(使用空格、制表符等分隔符)
|
|
|
words = line.strip().split()
|
|
|
+
|
|
|
for word in words:
|
|
|
- # 清理单词(移除标点符号,转为小写)
|
|
|
- word = word.strip('.,!?;:()[]{}"\'').lower()
|
|
|
- if word: # 确保单词非空
|
|
|
+ # 清理单词(移除标点符号)
|
|
|
+ word = word.strip('.,!?;:()[]{}"\'')
|
|
|
+
|
|
|
+ # 处理大小写
|
|
|
+ if not case_sensitive:
|
|
|
+ word = word.lower()
|
|
|
+
|
|
|
+ # 检查单词长度
|
|
|
+ if len(word) < min_word_length:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 检查停用词
|
|
|
+ if stop_words and word in stop_words:
|
|
|
+ continue
|
|
|
+
|
|
|
+ if word:
|
|
|
results.append((word, 1))
|
|
|
+
|
|
|
return results
|
|
|
|
|
|
def combiner(self, pairs: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
|
|
|
@@ -112,14 +253,22 @@ class WordCountStreaming:
|
|
|
total = sum(counts)
|
|
|
return (word, total)
|
|
|
|
|
|
- def run_mapper_from_stdin(self):
|
|
|
+ def run_mapper_from_stdin(self,
|
|
|
+ case_sensitive: bool = False,
|
|
|
+ min_word_length: int = 1,
|
|
|
+ stop_words: Optional[List[str]] = None):
|
|
|
"""
|
|
|
从标准输入运行 Mapper(用于 Hadoop Streaming)
|
|
|
|
|
|
从 stdin 读取每行数据,执行 Mapper 逻辑,输出到 stdout。
|
|
|
+
|
|
|
+ Args:
|
|
|
+ case_sensitive: 是否区分大小写
|
|
|
+ min_word_length: 最小单词长度
|
|
|
+ stop_words: 停用词列表
|
|
|
"""
|
|
|
for line in sys.stdin:
|
|
|
- pairs = self.mapper(line)
|
|
|
+ pairs = self.mapper(line, case_sensitive, min_word_length, stop_words)
|
|
|
for word, count in pairs:
|
|
|
print(f"{word}\t{count}")
|
|
|
|
|
|
@@ -170,8 +319,9 @@ class WordCountStreaming:
|
|
|
def run(self, input_path: str, output_path: str,
|
|
|
mapper_script: Optional[str] = None,
|
|
|
reducer_script: Optional[str] = None,
|
|
|
- combiner: bool = True,
|
|
|
- num_reducers: int = 1) -> bool:
|
|
|
+ combiner: Optional[bool] = None,
|
|
|
+ num_reducers: Optional[int] = None,
|
|
|
+ timeout: Optional[int] = None) -> bool:
|
|
|
"""
|
|
|
运行完整的 WordCount 作业
|
|
|
|
|
|
@@ -182,13 +332,21 @@ class WordCountStreaming:
|
|
|
output_path: HDFS 输出路径(不能已存在)
|
|
|
mapper_script: Mapper 脚本路径(可选,默认使用当前脚本)
|
|
|
reducer_script: Reducer 脚本路径(可选,默认使用当前脚本)
|
|
|
- combiner: 是否使用 Combiner
|
|
|
- num_reducers: Reducer 任务数量
|
|
|
+ combiner: 是否使用 Combiner(可选,默认为配置中的设置)
|
|
|
+ num_reducers: Reducer 任务数量(可选,默认为配置中的设置)
|
|
|
+ timeout: 超时时间(秒,可选,默认为配置中的设置)
|
|
|
|
|
|
Returns:
|
|
|
作业是否成功完成
|
|
|
+
|
|
|
+ Raises:
|
|
|
+ RuntimeError: 当找不到 Streaming jar 时
|
|
|
+ ValueError: 当输入路径无效时
|
|
|
"""
|
|
|
- import os
|
|
|
+ # 使用配置中的默认值
|
|
|
+ use_combiner = default_value(combiner, self.use_combiner)
|
|
|
+ reducers = default_value(num_reducers, self.num_reducers)
|
|
|
+ job_timeout = default_value(timeout, self.job_timeout)
|
|
|
|
|
|
# 确定脚本路径
|
|
|
if mapper_script is None:
|
|
|
@@ -200,7 +358,7 @@ class WordCountStreaming:
|
|
|
streaming_jar = self._find_streaming_jar()
|
|
|
if not streaming_jar:
|
|
|
self.logger.error("Could not find Hadoop Streaming jar")
|
|
|
- return False
|
|
|
+ raise RuntimeError("Hadoop Streaming jar not found")
|
|
|
|
|
|
cmd_parts = [
|
|
|
self.hadoop_cmd,
|
|
|
@@ -210,16 +368,16 @@ class WordCountStreaming:
|
|
|
'-reducer', f"python3 {os.path.basename(reducer_script)} reducer",
|
|
|
'-input', input_path,
|
|
|
'-output', output_path,
|
|
|
- '-D', f"mapreduce.job.reduces={num_reducers}"
|
|
|
+ '-D', f"mapreduce.job.reduces={reducers}"
|
|
|
]
|
|
|
|
|
|
- if combiner:
|
|
|
+ if use_combiner:
|
|
|
cmd_parts.extend(['-combiner', f"python3 {os.path.basename(mapper_script)} mapper | sort | python3 {os.path.basename(reducer_script)} reducer"])
|
|
|
|
|
|
cmd = ' '.join(cmd_parts)
|
|
|
self.logger.info(f"Running Hadoop Streaming job: {cmd}")
|
|
|
|
|
|
- returncode, stdout, stderr = run_command(cmd, timeout=3600) # 1小时超时
|
|
|
+ returncode, stdout, stderr = run_command(cmd, timeout=job_timeout)
|
|
|
|
|
|
if returncode == 0:
|
|
|
self.logger.info("WordCount job completed successfully")
|
|
|
@@ -237,7 +395,6 @@ class WordCountStreaming:
|
|
|
Returns:
|
|
|
Streaming jar 文件路径,如果未找到返回 None
|
|
|
"""
|
|
|
- import os
|
|
|
import glob
|
|
|
|
|
|
# 尝试从常见位置查找
|
|
|
@@ -265,7 +422,11 @@ class WordCountStreaming:
|
|
|
|
|
|
return None
|
|
|
|
|
|
- def count_words_locally(self, text: str) -> Dict[str, int]:
|
|
|
+ def count_words_locally(self, text: str,
|
|
|
+ case_sensitive: bool = False,
|
|
|
+ min_word_length: int = 1,
|
|
|
+ stop_words: Optional[List[str]] = None,
|
|
|
+ top_n: int = 10) -> WordCountResult:
|
|
|
"""
|
|
|
本地统计单词(不使用 Hadoop)
|
|
|
|
|
|
@@ -273,19 +434,29 @@ class WordCountStreaming:
|
|
|
|
|
|
Args:
|
|
|
text: 输入文本
|
|
|
+ case_sensitive: 是否区分大小写
|
|
|
+ min_word_length: 最小单词长度
|
|
|
+ stop_words: 停用词列表
|
|
|
+ top_n: 返回前 N 个最常见的单词
|
|
|
|
|
|
Returns:
|
|
|
- 单词计数字典
|
|
|
+ WordCountResult 结果对象
|
|
|
|
|
|
Example:
|
|
|
>>> wc = WordCountStreaming()
|
|
|
- >>> wc.count_words_locally("hello world hello")
|
|
|
- {'hello': 2, 'world': 1}
|
|
|
+ >>> result = wc.count_words_locally("hello world hello")
|
|
|
+ >>> result.total_words
|
|
|
+ 3
|
|
|
+ >>> result.unique_words
|
|
|
+ 2
|
|
|
"""
|
|
|
+ import time
|
|
|
+ start_time = time.time()
|
|
|
+
|
|
|
# 模拟完整的 MapReduce 流程
|
|
|
all_pairs = []
|
|
|
for line in text.split('\n'):
|
|
|
- pairs = self.mapper(line)
|
|
|
+ pairs = self.mapper(line, case_sensitive, min_word_length, stop_words)
|
|
|
all_pairs.extend(pairs)
|
|
|
|
|
|
# 按单词分组
|
|
|
@@ -294,12 +465,109 @@ class WordCountStreaming:
|
|
|
word_groups[word].append(count)
|
|
|
|
|
|
# 执行 Reduce
|
|
|
- results = {}
|
|
|
+ word_counts = {}
|
|
|
for word, counts in word_groups.items():
|
|
|
_, total = self.reducer(word, counts)
|
|
|
- results[word] = total
|
|
|
+ word_counts[word] = total
|
|
|
|
|
|
- return results
|
|
|
+ # 计算统计信息
|
|
|
+ total_words = sum(word_counts.values())
|
|
|
+ unique_words = len(word_counts)
|
|
|
+ top_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:top_n]
|
|
|
+
|
|
|
+ # 计算执行时间
|
|
|
+ execution_time_ms = (time.time() - start_time) * 1000
|
|
|
+
|
|
|
+ return WordCountResult(
|
|
|
+ total_words=total_words,
|
|
|
+ unique_words=unique_words,
|
|
|
+ top_words=top_words,
|
|
|
+ word_counts=word_counts,
|
|
|
+ execution_time_ms=execution_time_ms
|
|
|
+ )
|
|
|
+
|
|
|
+ def analyze_text(self, text: str) -> Dict[str, Any]:
|
|
|
+ """
|
|
|
+ 分析文本的详细统计信息
|
|
|
+
|
|
|
+ Args:
|
|
|
+ text: 输入文本
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 包含详细统计信息的字典
|
|
|
+ """
|
|
|
+ result = self.count_words_locally(text)
|
|
|
+
|
|
|
+ # 计算额外的统计信息
|
|
|
+ word_lengths = [len(word) for word in result.word_counts.keys()]
|
|
|
+
|
|
|
+ analysis = {
|
|
|
+ 'total_words': result.total_words,
|
|
|
+ 'unique_words': result.unique_words,
|
|
|
+ 'most_frequent_word': result.most_frequent_word,
|
|
|
+ 'avg_word_frequency': result.avg_word_frequency,
|
|
|
+ 'min_word_length': min(word_lengths) if word_lengths else 0,
|
|
|
+ 'max_word_length': max(word_lengths) if word_lengths else 0,
|
|
|
+ 'avg_word_length': sum(word_lengths) / len(word_lengths) if word_lengths else 0,
|
|
|
+ 'word_frequency_distribution': self._get_frequency_distribution(result.word_counts),
|
|
|
+ 'top_10_words': result.top_words[:10]
|
|
|
+ }
|
|
|
+
|
|
|
+ return analysis
|
|
|
+
|
|
|
+ def _get_frequency_distribution(self, word_counts: Dict[str, int]) -> Dict[str, int]:
|
|
|
+ """
|
|
|
+ 获取词频分布
|
|
|
+
|
|
|
+ Args:
|
|
|
+ word_counts: 单词计数字典
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 词频分布(出现1次的单词数、出现2次的单词数等)
|
|
|
+ """
|
|
|
+ distribution = defaultdict(int)
|
|
|
+ for count in word_counts.values():
|
|
|
+ if count == 1:
|
|
|
+ distribution['once'] += 1
|
|
|
+ elif count <= 5:
|
|
|
+ distribution['2-5'] += 1
|
|
|
+ elif count <= 10:
|
|
|
+ distribution['6-10'] += 1
|
|
|
+ else:
|
|
|
+ distribution['10+'] += 1
|
|
|
+ return dict(distribution)
|
|
|
+
|
|
|
+ def get_stop_words(self, language: str = 'english') -> List[str]:
|
|
|
+ """
|
|
|
+ 获取常见停用词列表
|
|
|
+
|
|
|
+ Args:
|
|
|
+ language: 语言(默认 'english')
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ 停用词列表
|
|
|
+ """
|
|
|
+ # 常见英语停用词
|
|
|
+ english_stop_words = [
|
|
|
+ 'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i',
|
|
|
+ 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do',
|
|
|
+ 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say',
|
|
|
+ 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would',
|
|
|
+ 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about',
|
|
|
+ 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can',
|
|
|
+ 'like', 'time', 'no', 'just', 'him', 'know', 'take', 'people',
|
|
|
+ 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see',
|
|
|
+ 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its',
|
|
|
+ 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how',
|
|
|
+ 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want',
|
|
|
+ 'because', 'any', 'these', 'give', 'day', 'most', 'us'
|
|
|
+ ]
|
|
|
+
|
|
|
+ if language.lower() == 'english':
|
|
|
+ return english_stop_words
|
|
|
+ else:
|
|
|
+ # 默认返回英语停用词
|
|
|
+ return english_stop_words
|
|
|
|
|
|
|
|
|
def main():
|
|
|
@@ -310,36 +578,189 @@ def main():
|
|
|
- mapper: 运行 Mapper
|
|
|
- reducer: 运行 Reducer
|
|
|
- local: 本地测试
|
|
|
+ - analyze: 分析文本
|
|
|
"""
|
|
|
- if len(sys.argv) < 2:
|
|
|
- print("Usage: python wordcount_streaming.py <command>")
|
|
|
- print("Commands:")
|
|
|
- print(" mapper - Run Mapper from stdin")
|
|
|
- print(" reducer - Run Reducer from stdin")
|
|
|
- print(" local - Run local test")
|
|
|
+ import argparse
|
|
|
+
|
|
|
+ parser = argparse.ArgumentParser(
|
|
|
+ description='Hadoop Streaming WordCount (Modern Version)',
|
|
|
+ formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
|
+ epilog='''
|
|
|
+Examples:
|
|
|
+ # 从标准输入运行 Mapper
|
|
|
+ python wordcount_streaming.py mapper < input.txt
|
|
|
+
|
|
|
+ # 从标准输入运行 Reducer
|
|
|
+ python wordcount_streaming.py reducer < mapper_output.txt
|
|
|
+
|
|
|
+ # 本地测试
|
|
|
+ python wordcount_streaming.py local
|
|
|
+
|
|
|
+ # 分析文本文件
|
|
|
+ python wordcount_streaming.py analyze --file input.txt --stop-words
|
|
|
+
|
|
|
+ # 使用增强的 Mapper 参数
|
|
|
+ python wordcount_streaming.py mapper --case-sensitive --min-length 3 < input.txt
|
|
|
+ '''
|
|
|
+ )
|
|
|
+
|
|
|
+ # 子命令
|
|
|
+ subparsers = parser.add_subparsers(dest='command', help='Available commands')
|
|
|
+
|
|
|
+ # mapper 子命令
|
|
|
+ mapper_parser = subparsers.add_parser('mapper', help='Run Mapper from stdin')
|
|
|
+ mapper_parser.add_argument('--case-sensitive', '-c', action='store_true',
|
|
|
+ help='Case-sensitive word matching')
|
|
|
+ mapper_parser.add_argument('--min-length', '-m', type=int, default=1,
|
|
|
+ help='Minimum word length (default: 1)')
|
|
|
+ mapper_parser.add_argument('--stop-words', '-s', action='store_true',
|
|
|
+ help='Use common English stop words')
|
|
|
+ mapper_parser.add_argument('--stop-words-file', type=str, default=None,
|
|
|
+ help='Custom stop words file path')
|
|
|
+
|
|
|
+ # reducer 子命令
|
|
|
+ reducer_parser = subparsers.add_parser('reducer', help='Run Reducer from stdin')
|
|
|
+
|
|
|
+ # local 子命令
|
|
|
+ local_parser = subparsers.add_parser('local', help='Run local test')
|
|
|
+ local_parser.add_argument('--file', '-f', type=str, default=None,
|
|
|
+ help='Input file path (default: use sample text)')
|
|
|
+ local_parser.add_argument('--case-sensitive', '-c', action='store_true',
|
|
|
+ help='Case-sensitive word matching')
|
|
|
+ local_parser.add_argument('--min-length', '-m', type=int, default=1,
|
|
|
+ help='Minimum word length (default: 1)')
|
|
|
+ local_parser.add_argument('--stop-words', '-s', action='store_true',
|
|
|
+ help='Use common English stop words')
|
|
|
+ local_parser.add_argument('--top-n', '-n', type=int, default=10,
|
|
|
+ help='Number of top words to show (default: 10)')
|
|
|
+ local_parser.add_argument('--format', '-fmt', type=str, default='text',
|
|
|
+ choices=['text', 'json', 'csv'],
|
|
|
+ help='Output format (default: text)')
|
|
|
+ local_parser.add_argument('--output', '-o', type=str, default=None,
|
|
|
+ help='Output file path (optional)')
|
|
|
+
|
|
|
+ # analyze 子命令
|
|
|
+ analyze_parser = subparsers.add_parser('analyze', help='Analyze text statistics')
|
|
|
+ analyze_parser.add_argument('--file', '-f', type=str, required=True,
|
|
|
+ help='Input file path')
|
|
|
+ analyze_parser.add_argument('--format', '-fmt', type=str, default='json',
|
|
|
+ choices=['text', 'json'],
|
|
|
+ help='Output format (default: json)')
|
|
|
+
|
|
|
+ args = parser.parse_args()
|
|
|
+
|
|
|
+ if not args.command:
|
|
|
+ parser.print_help()
|
|
|
sys.exit(1)
|
|
|
|
|
|
- command = sys.argv[1]
|
|
|
wc = WordCountStreaming()
|
|
|
|
|
|
- if command == 'mapper':
|
|
|
- wc.run_mapper_from_stdin()
|
|
|
- elif command == 'reducer':
|
|
|
+ # 处理停用词
|
|
|
+ stop_words = None
|
|
|
+ if hasattr(args, 'stop_words') and args.stop_words:
|
|
|
+ stop_words = wc.get_stop_words()
|
|
|
+
|
|
|
+ if hasattr(args, 'stop_words_file') and args.stop_words_file:
|
|
|
+ if os.path.exists(args.stop_words_file):
|
|
|
+ with open(args.stop_words_file, 'r', encoding='utf-8') as f:
|
|
|
+ custom_stop_words = [line.strip().lower() for line in f if line.strip()]
|
|
|
+ if stop_words is None:
|
|
|
+ stop_words = custom_stop_words
|
|
|
+ else:
|
|
|
+ stop_words.extend(custom_stop_words)
|
|
|
+
|
|
|
+ if args.command == 'mapper':
|
|
|
+ wc.run_mapper_from_stdin(
|
|
|
+ case_sensitive=getattr(args, 'case_sensitive', False),
|
|
|
+ min_word_length=getattr(args, 'min_length', 1),
|
|
|
+ stop_words=stop_words
|
|
|
+ )
|
|
|
+
|
|
|
+ elif args.command == 'reducer':
|
|
|
wc.run_reducer_from_stdin()
|
|
|
- elif command == 'local':
|
|
|
+
|
|
|
+ elif args.command == 'local':
|
|
|
# 本地测试
|
|
|
- test_text = """
|
|
|
- Hello world, hello Hadoop!
|
|
|
- Hadoop is great for big data.
|
|
|
- Big data processing with Hadoop.
|
|
|
- """
|
|
|
- result = wc.count_words_locally(test_text)
|
|
|
- print("Word count results:")
|
|
|
- for word, count in sorted(result.items(), key=lambda x: x[1], reverse=True):
|
|
|
- print(f"{word}: {count}")
|
|
|
- else:
|
|
|
- print(f"Unknown command: {command}")
|
|
|
- sys.exit(1)
|
|
|
+ if args.file and os.path.exists(args.file):
|
|
|
+ with open(args.file, 'r', encoding='utf-8') as f:
|
|
|
+ test_text = f.read()
|
|
|
+ else:
|
|
|
+ # 使用示例文本
|
|
|
+ test_text = """
|
|
|
+ Hello world, hello Hadoop!
|
|
|
+ Hadoop is great for big data.
|
|
|
+ Big data processing with Hadoop.
|
|
|
+ Spark is fast and general engine for large-scale data processing.
|
|
|
+ Hadoop provides massive storage for any kind of data.
|
|
|
+ """
|
|
|
+
|
|
|
+ # 执行统计
|
|
|
+ result = wc.count_words_locally(
|
|
|
+ test_text,
|
|
|
+ case_sensitive=args.case_sensitive,
|
|
|
+ min_word_length=args.min_length,
|
|
|
+ stop_words=stop_words,
|
|
|
+ top_n=args.top_n
|
|
|
+ )
|
|
|
+
|
|
|
+ # 确定输出格式
|
|
|
+ output_format = OutputFormat(args.format)
|
|
|
+
|
|
|
+ # 输出结果
|
|
|
+ if args.output:
|
|
|
+ result.save_to_file(args.output, output_format)
|
|
|
+ print(f"Result saved to: {args.output}")
|
|
|
+ else:
|
|
|
+ if output_format == OutputFormat.JSON:
|
|
|
+ print(result.to_json(indent=2))
|
|
|
+ elif output_format == OutputFormat.CSV:
|
|
|
+ print('word,count')
|
|
|
+ for word, count in sorted(result.word_counts.items(), key=lambda x: x[1], reverse=True):
|
|
|
+ print(f'"{word}",{count}')
|
|
|
+ else:
|
|
|
+ print(f"\n{'='*60}")
|
|
|
+ print(f"Word Count Results (Local Mode)")
|
|
|
+ print(f"{'='*60}")
|
|
|
+ print(f"\nTotal words: {result.total_words:,}")
|
|
|
+ print(f"Unique words: {result.unique_words:,}")
|
|
|
+ print(f"Execution time: {result.execution_time_ms:.2f} ms")
|
|
|
+
|
|
|
+ if result.most_frequent_word:
|
|
|
+ print(f"\nMost frequent word: {result.most_frequent_word[0]} ({result.most_frequent_word[1]} times)")
|
|
|
+
|
|
|
+ print(f"\nTop {args.top_n} words:")
|
|
|
+ for word, count in result.top_words:
|
|
|
+ print(f" {word}: {count}")
|
|
|
+
|
|
|
+ elif args.command == 'analyze':
|
|
|
+ # 分析文本
|
|
|
+ with open(args.file, 'r', encoding='utf-8') as f:
|
|
|
+ text = f.read()
|
|
|
+
|
|
|
+ analysis = wc.analyze_text(text)
|
|
|
+
|
|
|
+ if args.format == 'json':
|
|
|
+ print(json.dumps(analysis, ensure_ascii=False, indent=2))
|
|
|
+ else:
|
|
|
+ print(f"\n{'='*60}")
|
|
|
+ print(f"Text Analysis Report")
|
|
|
+ print(f"{'='*60}")
|
|
|
+ print(f"\nFile: {args.file}")
|
|
|
+ print(f"\nBasic Statistics:")
|
|
|
+ print(f" Total words: {analysis['total_words']:,}")
|
|
|
+ print(f" Unique words: {analysis['unique_words']:,}")
|
|
|
+ print(f" Most frequent: {analysis['most_frequent_word']}")
|
|
|
+ print(f" Average frequency: {analysis['avg_word_frequency']:.2f}")
|
|
|
+ print(f"\nWord Length Statistics:")
|
|
|
+ print(f" Min word length: {analysis['min_word_length']}")
|
|
|
+ print(f" Max word length: {analysis['max_word_length']}")
|
|
|
+ print(f" Average word length: {analysis['avg_word_length']:.2f}")
|
|
|
+ print(f"\nWord Frequency Distribution:")
|
|
|
+ for key, value in analysis['word_frequency_distribution'].items():
|
|
|
+ print(f" {key}: {value}")
|
|
|
+ print(f"\nTop 10 Words:")
|
|
|
+ for word, count in analysis['top_10_words']:
|
|
|
+ print(f" {word}: {count}")
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|