rag_service

This commit is contained in:
czzhangheng1314 2026-02-28 19:11:09 +08:00
parent 1b791ca1b7
commit 56b276eb3f
15 changed files with 216 additions and 9 deletions

View File

@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="edu" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="RAG" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -3,5 +3,5 @@
<component name="Black">
<option name="sdkName" value="Python 3.12 (Eula)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="edu" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="RAG" project-jdk-type="Python SDK" />
</project>

View File

@ -81,3 +81,162 @@ pypdf.errors.PdfReadError: Not an encrypted file
2026-02-28 03:36:40,073 - Agent - INFO - vector_store.py:70 - [加载知识库]: /Users/czzhangheng/edu/agent_proj/data/扫拖一体机器人100问.txt 的 md5已存在, 跳过
2026-02-28 03:36:40,074 - Agent - INFO - vector_store.py:70 - [加载知识库]: /Users/czzhangheng/edu/agent_proj/data/维护保养.txt 的 md5已存在, 跳过
2026-02-28 03:36:51,201 - Agent - INFO - vector_store.py:88 - [加载知识库]/Users/czzhangheng/edu/agent_proj/data/扫地机器人100问.pdf 加载成功
2026-02-28 13:09:38,673 - Agent - INFO - logger_handler.py:47 - info
2026-02-28 13:09:38,673 - Agent - ERROR - logger_handler.py:48 - Error
2026-02-28 13:09:52,924 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\扫地机器人100问.pdf 失败,错误 `pypdf` package not found, please install it with `pip install pypdf`
Traceback (most recent call last):
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\parsers\pdf.py", line 359, in lazy_parse
import pypdf
ModuleNotFoundError: No module named 'pypdf'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document
document: list[Documents] = get_file_document(path)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 58, in get_file_document
return pdf_loader(read_path)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 51, in pdf_loader
return PyPDFLoader(file_path, pwd).load()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load
return list(self.lazy_load())
^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\pdf.py", line 305, in lazy_load
yield from self.parser.lazy_parse(blob)
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\parsers\pdf.py", line 361, in lazy_parse
raise ImportError(
ImportError: `pypdf` package not found, please install it with `pip install pypdf`
2026-02-28 13:09:52,928 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\扫地机器人100问2.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\扫地机器人100问2.txt
Traceback (most recent call last):
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load
text = f.read()
^^^^^^^^
UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 4: illegal multibyte sequence
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document
document: list[Documents] = get_file_document(path)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document
return txt_loader(read_path)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader
return TextLoader(file_path).load()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load
return list(self.lazy_load())
^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load
raise RuntimeError(f"Error loading {self.file_path}") from e
RuntimeError: Error loading C:\Users\cz189\agent_proj\data\扫地机器人100问2.txt
2026-02-28 13:09:52,930 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\扫拖一体机器人100问.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\扫拖一体机器人100问.txt
Traceback (most recent call last):
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load
text = f.read()
^^^^^^^^
UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 4: illegal multibyte sequence
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document
document: list[Documents] = get_file_document(path)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document
return txt_loader(read_path)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader
return TextLoader(file_path).load()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load
return list(self.lazy_load())
^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load
raise RuntimeError(f"Error loading {self.file_path}") from e
RuntimeError: Error loading C:\Users\cz189\agent_proj\data\扫拖一体机器人100问.txt
2026-02-28 13:09:52,933 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\故障排除.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\故障排除.txt
Traceback (most recent call last):
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load
text = f.read()
^^^^^^^^
UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 2: illegal multibyte sequence
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document
document: list[Documents] = get_file_document(path)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document
return txt_loader(read_path)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader
return TextLoader(file_path).load()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load
return list(self.lazy_load())
^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load
raise RuntimeError(f"Error loading {self.file_path}") from e
RuntimeError: Error loading C:\Users\cz189\agent_proj\data\故障排除.txt
2026-02-28 13:09:52,934 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\维护保养.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\维护保养.txt
Traceback (most recent call last):
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load
text = f.read()
^^^^^^^^
UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 4: illegal multibyte sequence
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document
document: list[Documents] = get_file_document(path)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document
return txt_loader(read_path)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader
return TextLoader(file_path).load()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load
return list(self.lazy_load())
^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load
raise RuntimeError(f"Error loading {self.file_path}") from e
RuntimeError: Error loading C:\Users\cz189\agent_proj\data\维护保养.txt
2026-02-28 13:09:52,936 - Agent - ERROR - vector_store.py:90 - [加载知识库]C:\Users\cz189\agent_proj\data\选购指南.txt 失败,错误 Error loading C:\Users\cz189\agent_proj\data\选购指南.txt
Traceback (most recent call last):
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 43, in lazy_load
text = f.read()
^^^^^^^^
UnicodeDecodeError: 'gbk' codec can't decode byte 0xab in position 2: illegal multibyte sequence
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 74, in load_document
document: list[Documents] = get_file_document(path)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\rag\vector_store.py", line 55, in get_file_document
return txt_loader(read_path)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\cz189\agent_proj\utils\file_handler.py", line 56, in txt_loader
return TextLoader(file_path).load()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_core\document_loaders\base.py", line 43, in load
return list(self.lazy_load())
^^^^^^^^^^^^^^^^^^^^^^
File "D:\Users\cz189\miniconda3\envs\RAG\Lib\site-packages\langchain_community\document_loaders\text.py", line 56, in lazy_load
raise RuntimeError(f"Error loading {self.file_path}") from e
RuntimeError: Error loading C:\Users\cz189\agent_proj\data\选购指南.txt
2026-02-28 13:12:19,618 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\扫地机器人100问.pdf 加载成功
2026-02-28 13:12:22,955 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\扫地机器人100问2.txt 加载成功
2026-02-28 13:12:27,114 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\扫拖一体机器人100问.txt 加载成功
2026-02-28 13:12:34,245 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\故障排除.txt 加载成功
2026-02-28 13:12:38,453 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\维护保养.txt 加载成功
2026-02-28 13:12:42,384 - Agent - INFO - vector_store.py:88 - [加载知识库]C:\Users\cz189\agent_proj\data\选购指南.txt 加载成功

11
md5.txt
View File

@ -1,5 +1,6 @@
94bd6c0b4283b8524f4e8dfc5b72f8ab
a29a4ba5a539a2a462def8f91eb98ccc
928c24dbdeeb1c0a109f5620c8bac429
d294281a5769e0a3d1455fbf637ee572
fc539588792a89ff842cf241cf822de16d726f4604d9e7ebcf59a932ae9e9c00
6d726f4604d9e7ebcf59a932ae9e9c00
4ef62dc4189785055436124361296d05
76d81647e3901472e01ad9859425506f
85fd3b64ff87fe08fdc5c4a34998b555
96a099807d1ad7c26e37b53d130f26d2
1ce3fc50860e15d1c106900b63ef42a2

Binary file not shown.

Binary file not shown.

Binary file not shown.

47
rag/rag_service.py Normal file
View File

@ -0,0 +1,47 @@
"""rag总结服务用户提问搜索参考资料提问+参考资料提供模型,让模型总结回复"""
from langchain_core.output_parsers import StrOutputParser
from rag.vector_store import VectorStoreSerivce
from utils.prompt_loader import load_rag_prompts
from langchain_core.prompts import PromptTemplate
from model.factory import chat_model
from langchain_community.docstore.document import Document
from typing import List
class RagSummarizeService:
    """Answer a question by retrieving reference documents and summarizing them.

    The service wires a retriever obtained from ``VectorStoreSerivce`` to a
    ``prompt | model | StrOutputParser`` chain. The prompt template receives
    two variables: ``input`` (the user's query) and ``context`` (the formatted
    reference material).
    """

    def __init__(self):
        """Build the vector store, retriever, prompt template, model and chain."""
        self.vector_store = VectorStoreSerivce()
        self.retriever = self.vector_store.get_retriever()
        self.prompt_template = load_rag_prompts()
        self.prompt_text = PromptTemplate.from_template(self.prompt_template)
        self.model = chat_model
        self.chain = self._init_chain()

    def _init_chain(self):
        """Compose the prompt, the chat model and a string output parser."""
        chain = self.prompt_text | self.model | StrOutputParser()
        return chain

    def retriever_docs(self, query: str) -> List[Document]:
        """Return the documents the retriever yields for ``query``."""
        return self.retriever.invoke(query)

    def rag_summarize(self, query: str) -> str:
        """Retrieve references for ``query`` and return the chain's answer.

        Args:
            query: The user's question.

        Returns:
            Whatever the chain produces for the given query and context.
        """
        context_docs = self.retriever_docs(query)
        # Number the references from 1 and build the context in a single pass;
        # an empty retrieval result yields an empty context string.
        context = "".join(
            f"[参考资料{counter}]: 参考资料:{doc.page_content} | 参考源: {doc.metadata} \n"
            for counter, doc in enumerate(context_docs, start=1)
        )
        return self.chain.invoke(
            {
                "input": query,
                "context": context,
            }
        )
if __name__ == '__main__':
    # Manual smoke test: build the service, ask one sample question and
    # print the model's summarized answer.
    rag = RagSummarizeService()
    response = rag.rag_summarize("查询小户型适合那些扫地机器人")
    print(response)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -53,4 +53,4 @@ def pdf_loader(file_path: str, pwd: str = None) -> List[Document]:
def txt_loader(file_path: str) -> List[Document]:
    """Load a TXT file.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The result of ``TextLoader(...).load()`` for the file.
    """
    # Decode explicitly as UTF-8: without an encoding the loader fell back to
    # the platform codec ('gbk' on the Windows host) and failed on these files.
    return TextLoader(file_path, encoding='utf-8').load()

View File

@ -31,7 +31,7 @@ def get_logger(
logger.addHandler(console_handler)
if not log_file:
log_file = os.path.join(LOG_ROOT, f"{name}_{datetime.now().strftime("%Y%m%d")}.log")
log_file = os.path.join(LOG_ROOT, f"{name}_{datetime.now().strftime('%Y%m%d')}.log")
file_handler = logging.FileHandler(log_file, encoding='utf-8')
file_handler.setLevel(file_level)