diff --git a/.idea/agent_proj.iml b/.idea/agent_proj.iml index 5e376b1..b9ed519 100644 --- a/.idea/agent_proj.iml +++ b/.idea/agent_proj.iml @@ -2,7 +2,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index f86cb7f..1511473 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,5 +3,5 @@ - + \ No newline at end of file diff --git a/logs/Agent_20260228.log b/logs/Agent_20260228.log index 3209a45..acb2335 100644 --- a/logs/Agent_20260228.log +++ b/logs/Agent_20260228.log @@ -81,3 +81,162 @@ pypdf.errors.PdfReadError: Not an encrypted file 2026-02-28 03:36:40,073 - Agent - INFO - vector_store.py:70 - [加载知识库]: /Users/czzhangheng/edu/agent_proj/data/扫拖一体机器人100问.txt 的 md5已存在, 跳过 2026-02-28 03:36:40,074 - Agent - INFO - vector_store.py:70 - [加载知识库]: /Users/czzhangheng/edu/agent_proj/data/维护保养.txt 的 md5已存在, 跳过 2026-02-28 03:36:51,201 - Agent - INFO - vector_store.py:88 - [加载知识库]/Users/czzhangheng/edu/agent_proj/data/扫地机器人100问.pdf 加载成功 +2026-02-28 13:09:38,673 - Agent - INFO - logger_handler.py:47 - info +2026-02-28 13:09:38,673 - Agent - ERROR - logger_handler.py:48 - Error +2026-02-28 13:09:52,924 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\扫地机器人100问.pdf 失败,错误 `pypdf` package not found, please install it with `pip install pypdf` +Traceback (most recent call last): + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\parsers\pdf.py", line 359, in lazy_parse + import pypdf +ModuleNotFoundError: No module named 'pypdf' + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document + document: list[Documents] = get_file_document(path) + ^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 58, in get_file_document + return pdf_loader(read_path) + ^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 51, in pdf_loader + return PyPDFLoader(file_path, pwd).load() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load + return list(self.lazy_load()) + ^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\pdf.py", line 305, in lazy_load + yield from self.parser.lazy_parse(blob) + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\parsers\pdf.py", line 361, in lazy_parse + raise ImportError( +ImportError: `pypdf` package not found, please install it with `pip install pypdf` +2026-02-28 13:09:52,928 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\扫地机器人100问2.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\扫地机器人100问2.txt +Traceback (most recent call last): + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load + text = f.read() + ^^^^^^^^ +UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 4: illegal multibyte sequence + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document + document: list[Documents] = get_file_document(path) + ^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document + return txt_loader(read_path) + ^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader + return TextLoader(file_path).load() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load + return list(self.lazy_load()) + ^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load + raise RuntimeError(f"Error loading {self.file_path}") from e +RuntimeError: Error loading C:\Users\cz189\agent_proj\data\扫地机器人100问2.txt +2026-02-28 13:09:52,930 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\扫拖一体机器人100问.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\扫拖一体机器人100问.txt +Traceback (most recent call last): + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load + text = f.read() + ^^^^^^^^ +UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 4: illegal multibyte sequence + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document + document: list[Documents] = get_file_document(path) + ^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document + return txt_loader(read_path) + ^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader + return TextLoader(file_path).load() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load + return list(self.lazy_load()) + ^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load + raise RuntimeError(f"Error loading {self.file_path}") from e +RuntimeError: Error loading C:\Users\cz189\agent_proj\data\扫拖一体机器人100问.txt +2026-02-28 13:09:52,933 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\故障排除.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\故障排除.txt +Traceback (most recent call last): + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load + text = f.read() + ^^^^^^^^ +UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 2: illegal multibyte sequence + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document + document: list[Documents] = get_file_document(path) + ^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document + return txt_loader(read_path) + ^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader + return TextLoader(file_path).load() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load + return list(self.lazy_load()) + ^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load + raise RuntimeError(f"Error loading {self.file_path}") from e +RuntimeError: Error loading C:\Users\cz189\agent_proj\data\故障排除.txt +2026-02-28 13:09:52,934 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\维护保养.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\维护保养.txt +Traceback (most recent call last): + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load + text = f.read() + ^^^^^^^^ +UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 4: illegal multibyte sequence + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document + document: list[Documents] = get_file_document(path) + ^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document + return txt_loader(read_path) + ^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader + return TextLoader(file_path).load() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load + return list(self.lazy_load()) + ^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load + raise RuntimeError(f"Error loading {self.file_path}") from e +RuntimeError: Error loading C:\Users\cz189\agent_proj\data\维护保养.txt +2026-02-28 13:09:52,936 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\选购指南.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\选购指南.txt +Traceback (most recent call last): + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load + text = f.read() + ^^^^^^^^ +UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 2: illegal multibyte sequence + +The above exception was the direct cause of the following exception: + +Traceback (most recent call last): + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document + document: list[Documents] = get_file_document(path) + ^^^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document + return txt_loader(read_path) + ^^^^^^^^^^^^^^^^^^^^^ + File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader + return TextLoader(file_path).load() + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load + return list(self.lazy_load()) + ^^^^^^^^^^^^^^^^^^^^^^ + File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load + raise RuntimeError(f"Error loading {self.file_path}") from e +RuntimeError: Error loading C:\Users\cz189\agent_proj\data\选购指南.txt +2026-02-28 13:12:19,618 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\扫地机器人100问.pdf 加载成功 +2026-02-28 13:12:22,955 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\扫地机器人100问2.txt 加载成功 +2026-02-28 13:12:27,114 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\扫拖一体机器人100问.txt 加载成功 +2026-02-28 13:12:34,245 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\故障排除.txt 加载成功 +2026-02-28 13:12:38,453 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\维护保养.txt 加载成功 +2026-02-28 13:12:42,384 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\选购指南.txt 加载成功 diff --git a/md5.txt b/md5.txt index 1258035..71cda04 100644 --- a/md5.txt +++ b/md5.txt @@ -1,5 +1,6 @@ -94bd6c0b4283b8524f4e8dfc5b72f8ab -a29a4ba5a539a2a462def8f91eb98ccc -928c24dbdeeb1c0a109f5620c8bac429 -d294281a5769e0a3d1455fbf637ee572 -fc539588792a89ff842cf241cf822de16d726f4604d9e7ebcf59a932ae9e9c00 +6d726f4604d9e7ebcf59a932ae9e9c00 +4ef62dc4189785055436124361296d05 +76d81647e3901472e01ad9859425506f +85fd3b64ff87fe08fdc5c4a34998b555 +96a099807d1ad7c26e37b53d130f26d2 +1ce3fc50860e15d1c106900b63ef42a2 diff --git a/model/__pycache__/factory.cpython-311.pyc b/model/__pycache__/factory.cpython-311.pyc new file mode 100644 index 0000000..d958fac Binary files /dev/null and b/model/__pycache__/factory.cpython-311.pyc differ diff --git a/rag/__pycache__/vector_store.cpython-311.pyc b/rag/__pycache__/vector_store.cpython-311.pyc new file mode 100644 index 0000000..f978c01 Binary files /dev/null and b/rag/__pycache__/vector_store.cpython-311.pyc differ diff --git a/rag/chroma_db/chroma.sqlite3 b/rag/chroma_db/chroma.sqlite3 index ee90f19..0c9fd39 100644 Binary files a/rag/chroma_db/chroma.sqlite3 and b/rag/chroma_db/chroma.sqlite3 differ diff --git a/rag/rag_service.py b/rag/rag_service.py new file mode 100644 index 0000000..049da9b --- /dev/null +++ b/rag/rag_service.py @@ -0,0 +1,47 @@ +"""rag总结服务:用户提问,搜索参考资料,提问+参考资料提供模型,让模型总结回复""" +from langchain_core.output_parsers import StrOutputParser + +from rag.vector_store import VectorStoreSerivce +from utils.prompt_loader import load_rag_prompts +from langchain_core.prompts import PromptTemplate +from model.factory import chat_model +from langchain_community.docstore.document import Document +from typing import List + +class RagSummarizeService: + def __init__(self): + self.vector_store = VectorStoreSerivce() + self.retriever = self.vector_store.get_retriever() + self.prompt_template = load_rag_prompts() + self.prompt_text = PromptTemplate.from_template(self.prompt_template) + self.model = chat_model + self.chain = self._init_chain() + + + def _init_chain(self): + chain = self.prompt_text | self.model | StrOutputParser() + return chain + + def retriever_docs(self, query: str) -> List[Document]: + return self.retriever.invoke(query) + + def rag_summarize(self, query: str) -> str: + context_docs = self.retriever_docs(query) + context = "" + counter = 0 + for doc in context_docs: + counter += 1 + context += f"[参考资料{counter}]: 参考资料:{doc.page_content} | 参考源: {doc.metadata} \n" + + return self.chain.invoke( + { + "input": query, + "context": context + } + ) + + +if __name__ == '__main__': + rag = RagSummarizeService() + response = rag.rag_summarize("查询小户型适合那些扫地机器人") + print(response) diff --git a/utils/__pycache__/config_handler.cpython-311.pyc b/utils/__pycache__/config_handler.cpython-311.pyc new file mode 100644 index 0000000..5201b7a Binary files /dev/null and b/utils/__pycache__/config_handler.cpython-311.pyc differ diff --git a/utils/__pycache__/file_handler.cpython-311.pyc b/utils/__pycache__/file_handler.cpython-311.pyc new file mode 100644 index 0000000..a1a0c41 Binary files /dev/null and b/utils/__pycache__/file_handler.cpython-311.pyc differ diff --git a/utils/__pycache__/logger_handler.cpython-311.pyc b/utils/__pycache__/logger_handler.cpython-311.pyc new file mode 100644 index 0000000..08fa5dd Binary files /dev/null and b/utils/__pycache__/logger_handler.cpython-311.pyc differ diff --git a/utils/__pycache__/path_tool.cpython-311.pyc b/utils/__pycache__/path_tool.cpython-311.pyc new file mode 100644 index 0000000..c362490 Binary files /dev/null and b/utils/__pycache__/path_tool.cpython-311.pyc differ diff --git a/utils/__pycache__/prompt_loader.cpython-311.pyc b/utils/__pycache__/prompt_loader.cpython-311.pyc new file mode 100644 index 0000000..8518485 Binary files /dev/null and b/utils/__pycache__/prompt_loader.cpython-311.pyc differ diff --git a/utils/file_handler.py b/utils/file_handler.py index 9629a43..cc423dc 100644 --- a/utils/file_handler.py +++ b/utils/file_handler.py @@ -53,4 +53,4 @@ def pdf_loader(file_path: str, pwd: str = None) -> List[Document]: def txt_loader(file_path: str) -> List[Document]: """加载TXT文件""" - return TextLoader(file_path).load() + return TextLoader(file_path, encoding='utf-8').load() diff --git a/utils/logger_handler.py b/utils/logger_handler.py index 65f34b3..62b521b 100644 --- a/utils/logger_handler.py +++ b/utils/logger_handler.py @@ -31,7 +31,7 @@ def get_logger( logger.addHandler(console_handler) if not log_file: - log_file = os.path.join(LOG_ROOT, f"{name}_{datetime.now().strftime("%Y%m%d")}.log") + log_file = os.path.join(LOG_ROOT, f"{name}_{datetime.now().strftime('%Y%m%d')}.log") file_handler = logging.FileHandler(log_file, encoding='utf-8') file_handler.setLevel(file_level)