import hashlib
import os
from utils.logger_handler import logger
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.docstore.document import Document
from typing import *
from utils.path_tool import get_abs_path


def get_file_md5_hex(file_path: str) -> str:
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in fixed-size binary chunks so that large files are
    never loaded into memory at once.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The MD5 digest as a hex string, or an empty string if the path does
        not exist, is a directory, or cannot be read. Every failure is
        logged via ``logger.error`` instead of being raised.
    """
    if not os.path.exists(file_path):
        logger.error(f"md5计算: 文件不存在: {file_path}")
        return ""

    if os.path.isdir(file_path):
        logger.error(f"md5计算: 不是文件: {file_path}")
        return ""

    digest = hashlib.md5()
    chunk_size = 4096

    try:
        with open(file_path, 'rb') as f:
            # read() returns b"" at EOF, which ends the loop.
            while chunk := f.read(chunk_size):
                digest.update(chunk)
    except Exception as e:
        # Best-effort contract: report the failure and return "" to callers.
        logger.error(f"md5计算: 读取文件时出错: {file_path}, 错误信息: {str(e)}")
        return ""

    return digest.hexdigest()


def listdir_with_allowed_type(
    path: str,
    allowed_type: Union[str, Iterable[str]],
) -> Tuple[str, ...]:
    """List the entries of a directory whose names end with an allowed suffix.

    Args:
        path: Directory to scan. It is passed through ``get_abs_path`` first
            (presumably resolving it to an absolute path -- confirm against
            ``utils.path_tool``).
        allowed_type: A single suffix (e.g. ``".txt"``) or an iterable of
            suffixes (tuple, list, set, ...).

    Returns:
        A tuple of joined ``path/name`` strings for every matching entry, in
        ``os.listdir`` order. An empty tuple is returned (and an error is
        logged) when ``path`` is not a directory.
    """
    path = get_abs_path(path)

    if not os.path.isdir(path):
        logger.error(f"listdir_with_allowed_type: 不是目录: {path}")
        # Same type as the success path, so callers always get a tuple.
        return ()

    # str.endswith only accepts a str or a tuple of str; normalise any other
    # iterable (list, set, ...) so it does not raise TypeError.
    if not isinstance(allowed_type, str):
        allowed_type = tuple(allowed_type)

    return tuple(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.endswith(allowed_type)
    )


def pdf_loader(file_path: str, pwd: Optional[str] = None) -> List[Document]:
    """Load a PDF file into a list of LangChain ``Document`` objects.

    Args:
        file_path: Path of the PDF file.
        pwd: Optional value forwarded as the second positional argument of
            ``PyPDFLoader`` (the PDF password in current langchain_community
            releases -- confirm for the pinned version).

    Returns:
        The documents produced by ``PyPDFLoader(...).load()``.
    """
    return PyPDFLoader(file_path, pwd).load()


def txt_loader(file_path: str) -> List[Document]:
    """Load a UTF-8 encoded text file into a list of LangChain ``Document`` objects.

    Args:
        file_path: Path of the text file.

    Returns:
        The documents produced by ``TextLoader(...).load()``.
    """
    return TextLoader(file_path, encoding='utf-8').load()