|
@@ -1,91 +1,206 @@
|
|
|
"""
|
|
"""
|
|
|
HDFS 文件系统操作模块
|
|
HDFS 文件系统操作模块
|
|
|
|
|
|
|
|
-提供与 Java 版本 CommonOperation 类相同的功能:
|
|
|
|
|
|
|
+提供现代化的 HDFS 操作能力:
|
|
|
|
|
+- 多种后端支持(命令行、hdfs 库、pyhdfs 库、WebHDFS)
|
|
|
|
|
+- 同步和异步 API
|
|
|
|
|
+- 上下文管理器支持
|
|
|
|
|
+- 配置管理集成
|
|
|
|
|
+- 重试机制
|
|
|
|
|
+- 丰富的错误处理
|
|
|
|
|
+
|
|
|
|
|
+功能对应 Java 版本 CommonOperation 类:
|
|
|
- 创建目录
|
|
- 创建目录
|
|
|
- 删除目录/文件
|
|
- 删除目录/文件
|
|
|
- 上传文件
|
|
- 上传文件
|
|
|
- 读写文件
|
|
- 读写文件
|
|
|
- 检查文件是否存在
|
|
- 检查文件是否存在
|
|
|
- 列出目录内容
|
|
- 列出目录内容
|
|
|
|
|
+- 获取文件信息
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
import os
|
|
import os
|
|
|
-from typing import List, Optional, Tuple
|
|
|
|
|
-from .utils.helpers import run_command, validate_hdfs_path, setup_logger
|
|
|
|
|
|
|
+import asyncio
|
|
|
|
|
+import time
|
|
|
|
|
+from abc import ABC, abstractmethod
|
|
|
|
|
+from dataclasses import dataclass, field
|
|
|
|
|
+from datetime import datetime
|
|
|
|
|
+from enum import Enum
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+from typing import (
|
|
|
|
|
+ Any, Callable, Dict, Generic, List, Optional,
|
|
|
|
|
+ Tuple, Type, TypeVar, Union, Iterator, AsyncIterator
|
|
|
|
|
+)
|
|
|
|
|
+from contextlib import contextmanager, asynccontextmanager
|
|
|
|
|
|
|
|
|
|
+from .config import ConfigurationManager, HDFSConfig, get_config
|
|
|
|
|
+from .utils.helpers import (
|
|
|
|
|
+ run_command, validate_hdfs_path, setup_logger, format_file_size
|
|
|
|
|
+)
|
|
|
|
|
|
|
|
-class HDFSOperations:
|
|
|
|
|
- """
|
|
|
|
|
- HDFS 文件系统操作类
|
|
|
|
|
-
|
|
|
|
|
- 封装了 Hadoop 命令行工具,提供与 HDFS 交互的各种方法。
|
|
|
|
|
- 功能与 Java 版本的 CommonOperation 类相对应。
|
|
|
|
|
- """
|
|
|
|
|
-
|
|
|
|
|
- def __init__(self, hadoop_home: Optional[str] = None, logger_name: str = 'hdfs_operations'):
|
|
|
|
|
- """
|
|
|
|
|
- 初始化 HDFSOperations 实例
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- hadoop_home: Hadoop 安装目录(可选,默认从环境变量获取)
|
|
|
|
|
- logger_name: 日志器名称
|
|
|
|
|
- """
|
|
|
|
|
- self.logger = setup_logger(logger_name)
|
|
|
|
|
- self.hadoop_home = hadoop_home or os.environ.get('HADOOP_HOME', '')
|
|
|
|
|
- self.hadoop_cmd = 'hdfs' if self._check_command_exists('hdfs') else 'hadoop'
|
|
|
|
|
-
|
|
|
|
|
- def _check_command_exists(self, cmd: str) -> bool:
|
|
|
|
|
- """
|
|
|
|
|
- 检查命令是否存在
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- cmd: 命令名称
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 命令是否存在
|
|
|
|
|
- """
|
|
|
|
|
- return os.system(f'which {cmd} > /dev/null 2>&1') == 0
|
|
|
|
|
-
|
|
|
|
|
- def _execute_hdfs_command(self, subcommand: str, args: List[str] = None) -> Tuple[int, str, str]:
|
|
|
|
|
- """
|
|
|
|
|
- 执行 HDFS 命令
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- subcommand: HDFS 子命令(如 dfs, fs 等)
|
|
|
|
|
- args: 命令参数列表
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- (return_code, stdout, stderr)
|
|
|
|
|
- """
|
|
|
|
|
|
|
+
|
|
|
|
|
+T = TypeVar('T')
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+class BackendType(Enum):
|
|
|
|
|
+ """HDFS 后端类型"""
|
|
|
|
|
+ CLI = "cli" # 命令行工具
|
|
|
|
|
+ HDFS_LIB = "hdfs_lib" # hdfs 库
|
|
|
|
|
+ PYHDFS = "pyhdfs" # pyhdfs 库
|
|
|
|
|
+ WEBHDFS = "webhdfs" # WebHDFS REST API
|
|
|
|
|
+ AUTO = "auto" # 自动选择可用的后端
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+@dataclass
|
|
|
|
|
+class FileStatus:
|
|
|
|
|
+ """文件状态信息"""
|
|
|
|
|
+ path: str
|
|
|
|
|
+ is_directory: bool
|
|
|
|
|
+ length: int = 0
|
|
|
|
|
+ replication: int = 1
|
|
|
|
|
+ block_size: int = 134217728 # 128MB 默认块大小
|
|
|
|
|
+ modification_time: Optional[datetime] = None
|
|
|
|
|
+ access_time: Optional[datetime] = None
|
|
|
|
|
+ owner: str = ""
|
|
|
|
|
+ group: str = ""
|
|
|
|
|
+ permission: str = "644"
|
|
|
|
|
+ is_snapshot: bool = False
|
|
|
|
|
+
|
|
|
|
|
+ @property
|
|
|
|
|
+ def size_formatted(self) -> str:
|
|
|
|
|
+ """格式化的文件大小"""
|
|
|
|
|
+ return format_file_size(self.length)
|
|
|
|
|
+
|
|
|
|
|
+ def to_dict(self) -> Dict[str, Any]:
|
|
|
|
|
+ """转换为字典"""
|
|
|
|
|
+ return {
|
|
|
|
|
+ 'path': self.path,
|
|
|
|
|
+ 'is_directory': self.is_directory,
|
|
|
|
|
+ 'length': self.length,
|
|
|
|
|
+ 'size_formatted': self.size_formatted,
|
|
|
|
|
+ 'replication': self.replication,
|
|
|
|
|
+ 'block_size': self.block_size,
|
|
|
|
|
+ 'modification_time': self.modification_time.isoformat() if self.modification_time else None,
|
|
|
|
|
+ 'access_time': self.access_time.isoformat() if self.access_time else None,
|
|
|
|
|
+ 'owner': self.owner,
|
|
|
|
|
+ 'group': self.group,
|
|
|
|
|
+ 'permission': self.permission,
|
|
|
|
|
+ 'is_snapshot': self.is_snapshot,
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+class HDFSBackend(ABC):
|
|
|
|
|
+ """HDFS 后端抽象基类"""
|
|
|
|
|
+
|
|
|
|
|
+ def __init__(self, config: HDFSConfig, logger):
|
|
|
|
|
+ self.config = config
|
|
|
|
|
+ self.logger = logger
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def is_available(self) -> bool:
|
|
|
|
|
+ """检查后端是否可用"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def make_dir(self, path: str) -> bool:
|
|
|
|
|
+ """创建目录"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def delete(self, path: str, recursive: bool = True) -> bool:
|
|
|
|
|
+ """删除文件或目录"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def copy_from_local(self, src: str, dst: str) -> bool:
|
|
|
|
|
+ """从本地上传文件到 HDFS"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def copy_to_local(self, src: str, dst: str) -> bool:
|
|
|
|
|
+ """从 HDFS 下载文件到本地"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def read_file(self, path: str) -> Optional[str]:
|
|
|
|
|
+ """读取文件内容"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def write_file(self, path: str, content: str, overwrite: bool = True) -> bool:
|
|
|
|
|
+ """写入文件内容"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def exists(self, path: str) -> bool:
|
|
|
|
|
+ """检查路径是否存在"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def list_dir(self, path: str) -> List[str]:
|
|
|
|
|
+ """列出目录内容"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def get_file_status(self, path: str) -> Optional[FileStatus]:
|
|
|
|
|
+ """获取文件状态"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def get_file_size(self, path: str) -> Optional[int]:
|
|
|
|
|
+ """获取文件大小"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def rename(self, src: str, dst: str) -> bool:
|
|
|
|
|
+ """重命名文件或目录"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def set_permission(self, path: str, permission: str) -> bool:
|
|
|
|
|
+ """设置文件权限"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ @abstractmethod
|
|
|
|
|
+ def set_owner(self, path: str, owner: Optional[str] = None,
|
|
|
|
|
+ group: Optional[str] = None) -> bool:
|
|
|
|
|
+ """设置文件所有者"""
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+class CLIBackend(HDFSBackend):
|
|
|
|
|
+ """命令行后端实现"""
|
|
|
|
|
+
|
|
|
|
|
+ def __init__(self, config: HDFSConfig, logger):
|
|
|
|
|
+ super().__init__(config, logger)
|
|
|
|
|
+ self.hadoop_cmd = config.hadoop_cmd or 'hdfs'
|
|
|
|
|
+ self._check_cmd_available()
|
|
|
|
|
+
|
|
|
|
|
+ def _check_cmd_available(self):
|
|
|
|
|
+ """检查命令是否可用"""
|
|
|
|
|
+ if os.system(f'which {self.hadoop_cmd} > /dev/null 2>&1') != 0:
|
|
|
|
|
+ self.hadoop_cmd = 'hadoop'
|
|
|
|
|
+ if os.system(f'which {self.hadoop_cmd} > /dev/null 2>&1') != 0:
|
|
|
|
|
+ self.logger.warning("Neither 'hdfs' nor 'hadoop' command found")
|
|
|
|
|
+
|
|
|
|
|
+ def is_available(self) -> bool:
|
|
|
|
|
+ """检查后端是否可用"""
|
|
|
|
|
+ return os.system(f'which {self.hadoop_cmd} > /dev/null 2>&1') == 0
|
|
|
|
|
+
|
|
|
|
|
+ def _execute_command(self, subcommand: str, args: List[str] = None) -> Tuple[int, str, str]:
|
|
|
|
|
+ """执行 HDFS 命令"""
|
|
|
args = args or []
|
|
args = args or []
|
|
|
cmd = f"{self.hadoop_cmd} {subcommand} {' '.join(args)}"
|
|
cmd = f"{self.hadoop_cmd} {subcommand} {' '.join(args)}"
|
|
|
self.logger.debug(f"Executing command: {cmd}")
|
|
self.logger.debug(f"Executing command: {cmd}")
|
|
|
return run_command(cmd)
|
|
return run_command(cmd)
|
|
|
|
|
|
|
|
def make_dir(self, path: str) -> bool:
|
|
def make_dir(self, path: str) -> bool:
|
|
|
- """
|
|
|
|
|
- 创建目录
|
|
|
|
|
-
|
|
|
|
|
- 对应 Java 版本的 makeDir 方法。
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- path: 要创建的目录路径
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 是否创建成功
|
|
|
|
|
-
|
|
|
|
|
- Example:
|
|
|
|
|
- >>> hdfs = HDFSOperations()
|
|
|
|
|
- >>> hdfs.make_dir('/user/root/test1')
|
|
|
|
|
- True
|
|
|
|
|
- """
|
|
|
|
|
if not validate_hdfs_path(path):
|
|
if not validate_hdfs_path(path):
|
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
self.logger.info(f"Creating directory: {path}")
|
|
self.logger.info(f"Creating directory: {path}")
|
|
|
- returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-mkdir', '-p', path])
|
|
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-mkdir', '-p', path])
|
|
|
|
|
|
|
|
if returncode == 0:
|
|
if returncode == 0:
|
|
|
self.logger.info(f"Successfully created directory: {path}")
|
|
self.logger.info(f"Successfully created directory: {path}")
|
|
@@ -95,23 +210,6 @@ class HDFSOperations:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
def delete(self, path: str, recursive: bool = True) -> bool:
|
|
def delete(self, path: str, recursive: bool = True) -> bool:
|
|
|
- """
|
|
|
|
|
- 删除文件或目录
|
|
|
|
|
-
|
|
|
|
|
- 对应 Java 版本的 delDir 和 delFile 方法。
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- path: 要删除的路径
|
|
|
|
|
- recursive: 是否递归删除(用于目录)
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 是否删除成功
|
|
|
|
|
-
|
|
|
|
|
- Example:
|
|
|
|
|
- >>> hdfs = HDFSOperations()
|
|
|
|
|
- >>> hdfs.delete('/user/hadoop/data/word.txt')
|
|
|
|
|
- True
|
|
|
|
|
- """
|
|
|
|
|
if not validate_hdfs_path(path):
|
|
if not validate_hdfs_path(path):
|
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
return False
|
|
return False
|
|
@@ -120,7 +218,7 @@ class HDFSOperations:
|
|
|
args = ['-rm', '-r'] if recursive else ['-rm']
|
|
args = ['-rm', '-r'] if recursive else ['-rm']
|
|
|
args.append(path)
|
|
args.append(path)
|
|
|
|
|
|
|
|
- returncode, stdout, stderr = self._execute_hdfs_command('dfs', args)
|
|
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', args)
|
|
|
|
|
|
|
|
if returncode == 0:
|
|
if returncode == 0:
|
|
|
self.logger.info(f"Successfully deleted: {path}")
|
|
self.logger.info(f"Successfully deleted: {path}")
|
|
@@ -130,23 +228,6 @@ class HDFSOperations:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
def copy_from_local(self, src: str, dst: str) -> bool:
|
|
def copy_from_local(self, src: str, dst: str) -> bool:
|
|
|
- """
|
|
|
|
|
- 从本地文件系统上传文件到 HDFS
|
|
|
|
|
-
|
|
|
|
|
- 对应 Java 版本的 putFile 方法。
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- src: 本地文件路径
|
|
|
|
|
- dst: HDFS 目标路径
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 是否上传成功
|
|
|
|
|
-
|
|
|
|
|
- Example:
|
|
|
|
|
- >>> hdfs = HDFSOperations()
|
|
|
|
|
- >>> hdfs.copy_from_local('/home/hadoop/word.txt', '/user/hadoop/data/')
|
|
|
|
|
- True
|
|
|
|
|
- """
|
|
|
|
|
if not os.path.exists(src):
|
|
if not os.path.exists(src):
|
|
|
self.logger.error(f"Local file not found: {src}")
|
|
self.logger.error(f"Local file not found: {src}")
|
|
|
return False
|
|
return False
|
|
@@ -156,7 +237,7 @@ class HDFSOperations:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
self.logger.info(f"Copying from local {src} to HDFS {dst}")
|
|
self.logger.info(f"Copying from local {src} to HDFS {dst}")
|
|
|
- returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-copyFromLocal', src, dst])
|
|
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-copyFromLocal', src, dst])
|
|
|
|
|
|
|
|
if returncode == 0:
|
|
if returncode == 0:
|
|
|
self.logger.info(f"Successfully copied {src} to {dst}")
|
|
self.logger.info(f"Successfully copied {src} to {dst}")
|
|
@@ -166,22 +247,12 @@ class HDFSOperations:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
def copy_to_local(self, src: str, dst: str) -> bool:
|
|
def copy_to_local(self, src: str, dst: str) -> bool:
|
|
|
- """
|
|
|
|
|
- 从 HDFS 下载文件到本地文件系统
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- src: HDFS 源路径
|
|
|
|
|
- dst: 本地目标路径
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 是否下载成功
|
|
|
|
|
- """
|
|
|
|
|
if not validate_hdfs_path(src):
|
|
if not validate_hdfs_path(src):
|
|
|
self.logger.error(f"Invalid HDFS path: {src}")
|
|
self.logger.error(f"Invalid HDFS path: {src}")
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
self.logger.info(f"Copying from HDFS {src} to local {dst}")
|
|
self.logger.info(f"Copying from HDFS {src} to local {dst}")
|
|
|
- returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-copyToLocal', src, dst])
|
|
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-copyToLocal', src, dst])
|
|
|
|
|
|
|
|
if returncode == 0:
|
|
if returncode == 0:
|
|
|
self.logger.info(f"Successfully copied {src} to {dst}")
|
|
self.logger.info(f"Successfully copied {src} to {dst}")
|
|
@@ -191,23 +262,6 @@ class HDFSOperations:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
def read_file(self, path: str) -> Optional[str]:
|
|
def read_file(self, path: str) -> Optional[str]:
|
|
|
- """
|
|
|
|
|
- 读取 HDFS 文件内容
|
|
|
|
|
-
|
|
|
|
|
- 对应 Java 版本的 readFile 方法。
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- path: HDFS 文件路径
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 文件内容(字符串),如果失败返回 None
|
|
|
|
|
-
|
|
|
|
|
- Example:
|
|
|
|
|
- >>> hdfs = HDFSOperations()
|
|
|
|
|
- >>> content = hdfs.read_file('/user/hadoop/data/write.txt')
|
|
|
|
|
- >>> print(content)
|
|
|
|
|
- da jia hao,cai shi zhen de hao!
|
|
|
|
|
- """
|
|
|
|
|
if not validate_hdfs_path(path):
|
|
if not validate_hdfs_path(path):
|
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
return None
|
|
return None
|
|
@@ -217,7 +271,7 @@ class HDFSOperations:
|
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
self.logger.info(f"Reading file: {path}")
|
|
self.logger.info(f"Reading file: {path}")
|
|
|
- returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-cat', path])
|
|
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-cat', path])
|
|
|
|
|
|
|
|
if returncode == 0:
|
|
if returncode == 0:
|
|
|
self.logger.info(f"Successfully read file: {path}")
|
|
self.logger.info(f"Successfully read file: {path}")
|
|
@@ -227,44 +281,25 @@ class HDFSOperations:
|
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
def write_file(self, path: str, content: str, overwrite: bool = True) -> bool:
|
|
def write_file(self, path: str, content: str, overwrite: bool = True) -> bool:
|
|
|
- """
|
|
|
|
|
- 写入内容到 HDFS 文件
|
|
|
|
|
-
|
|
|
|
|
- 对应 Java 版本的 writeFile 方法。
|
|
|
|
|
|
|
+ import tempfile
|
|
|
|
|
|
|
|
- Args:
|
|
|
|
|
- path: HDFS 文件路径
|
|
|
|
|
- content: 要写入的内容
|
|
|
|
|
- overwrite: 是否覆盖已存在的文件
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 是否写入成功
|
|
|
|
|
-
|
|
|
|
|
- Example:
|
|
|
|
|
- >>> hdfs = HDFSOperations()
|
|
|
|
|
- >>> hdfs.write_file('/user/hadoop/data/write.txt', 'da jia hao,cai shi zhen de hao!')
|
|
|
|
|
- True
|
|
|
|
|
- """
|
|
|
|
|
if not validate_hdfs_path(path):
|
|
if not validate_hdfs_path(path):
|
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
self.logger.info(f"Writing to file: {path}")
|
|
self.logger.info(f"Writing to file: {path}")
|
|
|
|
|
|
|
|
- # 创建临时文件
|
|
|
|
|
- import tempfile
|
|
|
|
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as temp_file:
|
|
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as temp_file:
|
|
|
temp_file.write(content)
|
|
temp_file.write(content)
|
|
|
temp_path = temp_file.name
|
|
temp_path = temp_file.name
|
|
|
|
|
|
|
|
try:
|
|
try:
|
|
|
- # 使用 put 命令上传临时文件
|
|
|
|
|
args = ['-put']
|
|
args = ['-put']
|
|
|
if overwrite:
|
|
if overwrite:
|
|
|
args.append('-f')
|
|
args.append('-f')
|
|
|
args.extend([temp_path, path])
|
|
args.extend([temp_path, path])
|
|
|
|
|
|
|
|
- returncode, stdout, stderr = self._execute_hdfs_command('dfs', args)
|
|
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', args)
|
|
|
|
|
|
|
|
if returncode == 0:
|
|
if returncode == 0:
|
|
|
self.logger.info(f"Successfully wrote to file: {path}")
|
|
self.logger.info(f"Successfully wrote to file: {path}")
|
|
@@ -273,36 +308,17 @@ class HDFSOperations:
|
|
|
self.logger.error(f"Failed to write to file: {path}, Error: {stderr}")
|
|
self.logger.error(f"Failed to write to file: {path}, Error: {stderr}")
|
|
|
return False
|
|
return False
|
|
|
finally:
|
|
finally:
|
|
|
- # 清理临时文件
|
|
|
|
|
if os.path.exists(temp_path):
|
|
if os.path.exists(temp_path):
|
|
|
os.unlink(temp_path)
|
|
os.unlink(temp_path)
|
|
|
|
|
|
|
|
def exists(self, path: str) -> bool:
|
|
def exists(self, path: str) -> bool:
|
|
|
- """
|
|
|
|
|
- 检查 HDFS 路径是否存在
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- path: HDFS 路径
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 路径是否存在
|
|
|
|
|
- """
|
|
|
|
|
if not validate_hdfs_path(path):
|
|
if not validate_hdfs_path(path):
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
- returncode, _, _ = self._execute_hdfs_command('dfs', ['-test', '-e', path])
|
|
|
|
|
|
|
+ returncode, _, _ = self._execute_command('dfs', ['-test', '-e', path])
|
|
|
return returncode == 0
|
|
return returncode == 0
|
|
|
|
|
|
|
|
def list_dir(self, path: str) -> List[str]:
|
|
def list_dir(self, path: str) -> List[str]:
|
|
|
- """
|
|
|
|
|
- 列出 HDFS 目录内容
|
|
|
|
|
-
|
|
|
|
|
- Args:
|
|
|
|
|
- path: HDFS 目录路径
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 目录内容列表
|
|
|
|
|
- """
|
|
|
|
|
if not validate_hdfs_path(path):
|
|
if not validate_hdfs_path(path):
|
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
return []
|
|
return []
|
|
@@ -311,16 +327,13 @@ class HDFSOperations:
|
|
|
self.logger.error(f"Directory does not exist: {path}")
|
|
self.logger.error(f"Directory does not exist: {path}")
|
|
|
return []
|
|
return []
|
|
|
|
|
|
|
|
- returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-ls', path])
|
|
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-ls', path])
|
|
|
|
|
|
|
|
if returncode == 0:
|
|
if returncode == 0:
|
|
|
- # 解析输出,提取文件名
|
|
|
|
|
lines = stdout.strip().split('\n')
|
|
lines = stdout.strip().split('\n')
|
|
|
- # 跳过第一行(如果是目录列表的标题)
|
|
|
|
|
if len(lines) > 0 and lines[0].startswith('Found'):
|
|
if len(lines) > 0 and lines[0].startswith('Found'):
|
|
|
lines = lines[1:]
|
|
lines = lines[1:]
|
|
|
|
|
|
|
|
- # 提取文件名(每一行的最后一个字段)
|
|
|
|
|
files = []
|
|
files = []
|
|
|
for line in lines:
|
|
for line in lines:
|
|
|
parts = line.split()
|
|
parts = line.split()
|
|
@@ -331,16 +344,58 @@ class HDFSOperations:
|
|
|
self.logger.error(f"Failed to list directory: {path}, Error: {stderr}")
|
|
self.logger.error(f"Failed to list directory: {path}, Error: {stderr}")
|
|
|
return []
|
|
return []
|
|
|
|
|
|
|
|
- def get_file_size(self, path: str) -> Optional[int]:
|
|
|
|
|
- """
|
|
|
|
|
- 获取 HDFS 文件大小
|
|
|
|
|
|
|
+ def get_file_status(self, path: str) -> Optional[FileStatus]:
|
|
|
|
|
+ if not validate_hdfs_path(path):
|
|
|
|
|
+ self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
|
|
+ return None
|
|
|
|
|
|
|
|
- Args:
|
|
|
|
|
- path: HDFS 文件路径
|
|
|
|
|
-
|
|
|
|
|
- Returns:
|
|
|
|
|
- 文件大小(字节),如果失败返回 None
|
|
|
|
|
- """
|
|
|
|
|
|
|
+ if not self.exists(path):
|
|
|
|
|
+ self.logger.error(f"Path does not exist: {path}")
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-stat', '%F,%s,%r,%b,%y,%z,%u,%g,%a', path])
|
|
|
|
|
+
|
|
|
|
|
+ if returncode == 0:
|
|
|
|
|
+ parts = stdout.strip().split(',')
|
|
|
|
|
+ if len(parts) >= 9:
|
|
|
|
|
+ is_dir = parts[0] == 'directory'
|
|
|
|
|
+ try:
|
|
|
|
|
+ return FileStatus(
|
|
|
|
|
+ path=path,
|
|
|
|
|
+ is_directory=is_dir,
|
|
|
|
|
+ length=int(parts[1]) if parts[1] else 0,
|
|
|
|
|
+ replication=int(parts[2]) if parts[2] else 1,
|
|
|
|
|
+ block_size=int(parts[3]) if parts[3] else 134217728,
|
|
|
|
|
+ modification_time=datetime.strptime(parts[4], '%Y-%m-%d %H:%M:%S') if parts[4] else None,
|
|
|
|
|
+ access_time=datetime.strptime(parts[5], '%Y-%m-%d %H:%M:%S') if parts[5] else None,
|
|
|
|
|
+ owner=parts[6],
|
|
|
|
|
+ group=parts[7],
|
|
|
|
|
+ permission=parts[8],
|
|
|
|
|
+ )
|
|
|
|
|
+ except (ValueError, IndexError) as e:
|
|
|
|
|
+ self.logger.warning(f"Failed to parse file status: {e}")
|
|
|
|
|
+
|
|
|
|
|
+ # 备用方法:使用 -ls
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-ls', '-d', path])
|
|
|
|
|
+ if returncode == 0:
|
|
|
|
|
+ parts = stdout.strip().split()
|
|
|
|
|
+ if len(parts) >= 8:
|
|
|
|
|
+ is_dir = parts[0].startswith('d')
|
|
|
|
|
+ try:
|
|
|
|
|
+ return FileStatus(
|
|
|
|
|
+ path=path,
|
|
|
|
|
+ is_directory=is_dir,
|
|
|
|
|
+ length=int(parts[4]) if parts[4] else 0,
|
|
|
|
|
+ replication=int(parts[1]) if parts[1] and not is_dir else 1,
|
|
|
|
|
+ owner=parts[2],
|
|
|
|
|
+ group=parts[3],
|
|
|
|
|
+ )
|
|
|
|
|
+ except (ValueError, IndexError):
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ def get_file_size(self, path: str) -> Optional[int]:
|
|
|
if not validate_hdfs_path(path):
|
|
if not validate_hdfs_path(path):
|
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
return None
|
|
return None
|
|
@@ -349,17 +404,97 @@ class HDFSOperations:
|
|
|
self.logger.error(f"File does not exist: {path}")
|
|
self.logger.error(f"File does not exist: {path}")
|
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
- returncode, stdout, stderr = self._execute_hdfs_command('dfs', ['-du', '-s', path])
|
|
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-du', '-s', path])
|
|
|
|
|
|
|
|
if returncode == 0:
|
|
if returncode == 0:
|
|
|
- # 解析输出,提取文件大小
|
|
|
|
|
parts = stdout.strip().split()
|
|
parts = stdout.strip().split()
|
|
|
if len(parts) >= 1:
|
|
if len(parts) >= 1:
|
|
|
try:
|
|
try:
|
|
|
return int(parts[0])
|
|
return int(parts[0])
|
|
|
except ValueError:
|
|
except ValueError:
|
|
|
self.logger.error(f"Failed to parse file size: {stdout}")
|
|
self.logger.error(f"Failed to parse file size: {stdout}")
|
|
|
- return None
|
|
|
|
|
|
|
+
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+ def rename(self, src: str, dst: str) -> bool:
|
|
|
|
|
+ if not validate_hdfs_path(src):
|
|
|
|
|
+ self.logger.error(f"Invalid source path: {src}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ if not validate_hdfs_path(dst):
|
|
|
|
|
+ self.logger.error(f"Invalid destination path: {dst}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ self.logger.info(f"Renaming {src} to {dst}")
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-mv', src, dst])
|
|
|
|
|
+
|
|
|
|
|
+ if returncode == 0:
|
|
|
|
|
+ self.logger.info(f"Successfully renamed {src} to {dst}")
|
|
|
|
|
+ return True
|
|
|
else:
|
|
else:
|
|
|
- self.logger.error(f"Failed to get file size: {path}, Error: {stderr}")
|
|
|
|
|
- return None
|
|
|
|
|
|
|
+ self.logger.error(f"Failed to rename {src} to {dst}, Error: {stderr}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ def set_permission(self, path: str, permission: str) -> bool:
|
|
|
|
|
+ if not validate_hdfs_path(path):
|
|
|
|
|
+ self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ self.logger.info(f"Setting permission of {path} to {permission}")
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-chmod', permission, path])
|
|
|
|
|
+
|
|
|
|
|
+ if returncode == 0:
|
|
|
|
|
+ self.logger.info(f"Successfully set permission of {path} to {permission}")
|
|
|
|
|
+ return True
|
|
|
|
|
+ else:
|
|
|
|
|
+ self.logger.error(f"Failed to set permission of {path}, Error: {stderr}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ def set_owner(self, path: str, owner: Optional[str] = None,
|
|
|
|
|
+ group: Optional[str] = None) -> bool:
|
|
|
|
|
+ if not validate_hdfs_path(path):
|
|
|
|
|
+ self.logger.error(f"Invalid HDFS path: {path}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ if owner is None and group is None:
|
|
|
|
|
+ self.logger.error("At least one of owner or group must be specified")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ owner_str = owner if owner else ''
|
|
|
|
|
+ group_str = f":{group}" if group else ''
|
|
|
|
|
+ owner_group = f"{owner_str}{group_str}"
|
|
|
|
|
+
|
|
|
|
|
+ self.logger.info(f"Setting owner of {path} to {owner_group}")
|
|
|
|
|
+ returncode, stdout, stderr = self._execute_command('dfs', ['-chown', owner_group, path])
|
|
|
|
|
+
|
|
|
|
|
+ if returncode == 0:
|
|
|
|
|
+ self.logger.info(f"Successfully set owner of {path} to {owner_group}")
|
|
|
|
|
+ return True
|
|
|
|
|
+ else:
|
|
|
|
|
+ self.logger.error(f"Failed to set owner of {path}, Error: {stderr}")
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+class WebHDFSBackend(HDFSBackend):
|
|
|
|
|
+ """WebHDFS REST API 后端实现"""
|
|
|
|
|
+
|
|
|
|
|
+ def __init__(self, config: HDFSConfig, logger):
|
|
|
|
|
+ super().__init__(config, logger)
|
|
|
|
|
+ self.base_url = f"http://{config.namenode_host}:{config.namenode_http_port}/webhdfs/v1"
|
|
|
|
|
+ self.user = config.user or os.environ.get('USER', 'hadoop')
|
|
|
|
|
+ self._session = None
|
|
|
|
|
+
|
|
|
|
|
+ def _get_session(self):
|
|
|
|
|
+ """获取 HTTP 会话"""
|
|
|
|
|
+ if self._session is None:
|
|
|
|
|
+ try:
|
|
|
|
|
+ import requests
|
|
|
|
|
+ self._session = requests.Session()
|
|
|
|
|
+ except ImportError:
|
|
|
|
|
+ self.logger.error("requests library is required for WebHDFS backend")
|
|
|
|
|
+ raise
|
|
|
|
|
+ return self._session
|
|
|
|
|
+
|
|
|
|
|
+ def is_available(self) -> bool:
|
|
|
|
|
+ """检查后端是否可用"""
|
|
|
|
|
+ try:
|