agent_proj/utils/file_handler.py

57 lines
1.6 KiB
Python

import hashlib
import os
from utils.logger_handler import logger
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.docstore.document import Document
from typing import *
from utils.path_tool import get_abs_path
def get_file_md5_hex(file_path: str) -> str:
    """Return the MD5 digest of a file as a hexadecimal string.

    The file is opened in binary mode and consumed in fixed-size chunks, so
    its full content is never held in memory at once.

    Args:
        file_path: Path of the file to hash.

    Returns:
        The hexadecimal MD5 digest. An empty string is returned (and the
        problem is reported through ``logger.error``) when the path does not
        exist, when it is a directory, or when opening/reading it fails.
    """
    # Reject paths that cannot be hashed before touching the file.
    if not os.path.exists(file_path):
        logger.error(f"md5计算: 文件不存在: {file_path}")
        return ""
    if os.path.isdir(file_path):
        logger.error(f"md5计算: 不是文件: {file_path}")
        return ""

    digest = hashlib.md5()
    read_size = 4096
    try:
        with open(file_path, 'rb') as stream:
            while True:
                block = stream.read(read_size)
                if not block:
                    # An empty read means end of file.
                    break
                digest.update(block)
        return digest.hexdigest()
    except Exception as e:
        # Best effort: callers get "" instead of an exception on any failure.
        logger.error(f"md5计算: 读取文件时出错: {file_path}, 错误信息: {str(e)}")
        return ""
def listdir_with_allowed_type(
    path: str,
    allowed_type: Union[str, Iterable[str]],
) -> Tuple[str, ...]:
    """List the entries of a directory whose names end with an allowed suffix.

    Args:
        path: Directory to scan. It is first passed through ``get_abs_path``
            and the result is used for the directory check, the listing and
            the returned paths.
        allowed_type: A single suffix (e.g. ``".txt"``) or a collection of
            suffixes (tuple, list, set, ...). An entry is kept when its name
            ends with any of them.

    Returns:
        A tuple of paths, each built by joining the resolved directory with a
        matching entry name, in the order produced by ``os.listdir``. An
        empty tuple is returned (and an error is logged) when the resolved
        path is not a directory.
    """
    path = get_abs_path(path)
    if not os.path.isdir(path):
        logger.error(f"listdir_with_allowed_type: 不是目录: {path}")
        # Same container type as the success path, so callers always get a tuple.
        return ()

    # str.endswith accepts only a str or a tuple of str; normalise any other
    # collection of suffixes so lists and sets work as well.
    suffixes = allowed_type if isinstance(allowed_type, str) else tuple(allowed_type)

    return tuple(
        os.path.join(path, entry)
        for entry in os.listdir(path)
        if entry.endswith(suffixes)
    )
def pdf_loader(file_path: str, pwd: Optional[str] = None) -> List[Document]:
    """Load a PDF file into a list of documents.

    Args:
        file_path: Path of the PDF file to load.
        pwd: Password handed to ``PyPDFLoader``; ``None`` (the default) when
            no password is supplied.

    Returns:
        The documents produced by ``PyPDFLoader(...).load()``.
    """
    # Pass the password by keyword so it cannot be bound to a different
    # parameter if the loader's positional order ever differs.
    return PyPDFLoader(file_path, password=pwd).load()
def txt_loader(file_path: str) -> List[Document]:
    """Load a text file into a list of documents.

    Args:
        file_path: Path of the text file to load.

    Returns:
        The documents produced by ``TextLoader(...).load()``; the loader is
        configured with the UTF-8 encoding.
    """
    loader = TextLoader(file_path, encoding='utf-8')
    return loader.load()