iT邦幫忙

2024 iThome 鐵人賽

DAY 14
0
生成式 AI

智能雲端架構圖生成:結合LangChain&LangGraph與RAG的創新應用系列 第 14

day14 chatDocument 文件分析器(二):混合資料格式解析!完全版chatDocument接受多檔案多來源的文件分析器

  • 分享至 

  • xImage
  •  

前言

昨天我們利用streamlit快速建構出csv檔案的智能文件分析器,今天我們將文件分析器利用自定義的Loader擴充py、txt、pdf等資料格式,甚至我們還能上傳多個不同格式的檔案,使得LLM接受的來源更加多元!

正文

  • 首先我們會進行一些修改,但大部分的內容都是重複的!

匯入模型

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import os
# 1. Create model
# NOTE(review): hard-coding the API key in source is unsafe — prefer reading
# it from the environment or a secrets manager.
os.environ["OPENAI_API_KEY"] = "你的OpenAI API key"

# Chat model that answers questions over the retrieved document context.
model = ChatOpenAI(model="gpt-4o")

# Embedding model used to vectorize document chunks for similarity search.
embeddingmodel = OpenAIEmbeddings(model="text-embedding-ada-002")

製作Loader

  • 製作Loader時,我們會在lazy_load方法當中,查看尾端的副檔名,並且導入到不同的Loader之中
from langchain_core.document_loaders import BaseLoader
from typing import Iterator
from langchain_core.documents import Document
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import PyPDFLoader
# 2. 製作loader
class CustomDocumentLoader(BaseLoader):
    """Extension-aware document loader.

    Dispatches on the file extension: ``.csv`` files go through CSVLoader,
    ``.pdf`` files through PyPDFLoader, and anything else is read as UTF-8
    text, yielding one Document per line.
    """

    def __init__(self, file_path: str) -> None:
        # Path of the file to load lazily.
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        """Yield Documents for the file, choosing a loader by extension."""
        if self.file_path.endswith('.csv'):
            # Delegate CSV parsing (one Document per row).
            yield from CSVLoader(self.file_path).lazy_load()
        elif self.file_path.endswith('.pdf'):
            # Delegate PDF parsing (one Document per page).
            yield from PyPDFLoader(self.file_path).lazy_load()
        else:
            # Fallback: treat the file as UTF-8 text, one Document per line.
            with open(self.file_path, encoding="utf-8") as f:
                # enumerate replaces the manual line_number counter.
                for line_number, line in enumerate(f):
                    yield Document(
                        page_content=line,
                        metadata={"line_number": line_number,
                                  "source": self.file_path},
                    )

process_documents

  • 這裡是作為streamlit的資料處理,也就是接受檔案以及使用者輸入的處理流程
  • 主要的更改在前面的files輸入,我們在streamlit的修改會變成多檔案的接收,因此使用for迴圈並搭配一個all_docs的list來載入多檔案資料源
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
def process_documents(files, user_input):
    """Index every uploaded file in a vector store and answer a question.

    Args:
        files: iterable of Streamlit ``UploadedFile`` objects.
        user_input: the user's question / extra instructions for the LLM.

    Returns:
        The LLM's answer as a plain string.
    """
    all_docs = []

    for uploaded_file in files:
        # Persist the upload to a temp file so the path-based loaders can
        # read it.  BUG FIX: keep a real "." before the extension — the
        # previous name "temp_uploaded_file_csv" never matched
        # endswith('.csv'), so CSV/PDF files were parsed as plain text.
        suffix = uploaded_file.name.split('.')[-1]
        temp_file_path = f"temp_uploaded_file.{suffix}"
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(uploaded_file.getvalue())

        # 1. Load the file through the extension-aware custom loader.
        loader = CustomDocumentLoader(temp_file_path)
        all_docs.extend(loader.lazy_load())

        # Delete the temp file once its content is in memory.
        os.remove(temp_file_path)

    # 2. Split into overlapping chunks.  BUG FIX: split *all_docs* (every
    # uploaded file) instead of only the last iteration's `docs`.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400, chunk_overlap=100)
    splits = text_splitter.split_documents(all_docs)

    # 3. Embed the chunks and store them in a Chroma vector store.
    vectorstore = Chroma.from_documents(
        documents=splits, embedding=embeddingmodel)
    retriever = vectorstore.as_retriever()

    # 4. Build the RAG chain: retrieve -> prompt -> LLM -> string output.
    template = """
    你是一個人工智慧輔助系統,可以根據文件的內容進行簡單的總結,並且根據使用者需求來進行分析
    以下為文件的內容:
    {context}
    使用者提供的額外資訊:
    {user_input}
    """
    custom_rag_prompt = PromptTemplate.from_template(template)

    def format_docs(docs):
        # Concatenate retrieved chunks into a single context string.
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | format_docs, "user_input": lambda x: x}
        | custom_rag_prompt
        | model
        | StrOutputParser()
    )

    # 5. Invoke the chain with the user's question.
    result = rag_chain.invoke(user_input)
    return result

run_streamlit

  • 主要的變更是file_uploader 多了個accept_multiple_files=True的參數,可以在介面上傳多個檔案
  • 另外type的部分改成 type=["txt", "py", "csv", "pdf"],接收更多元的資料
import streamlit as st
def run_streamlit():
    """Render the Streamlit UI: upload files, collect a prompt, show the answer."""
    st.set_page_config(
        page_title="文件分析與總結器", page_icon="📄")
    st.title("文件分析與總結器")
    st.write("")
    st.write("### Step 1: 上傳文件")
    # BUG FIX: "pdf" was missing from the accepted types, contradicting the
    # full version of the code and the PDF branch in CustomDocumentLoader.
    uploaded_files = st.file_uploader(
        "Choose a file", type=["txt", "py", "csv", "pdf"],
        label_visibility="visible", accept_multiple_files=True)
    st.write("")
    st.write("### Step 2: 輸入額外資訊,讓人工智慧處理")
    user_input = st.text_input(
        "Enter Additional Information", "")
    st.write("")
    st.write("### Step 3: 提交")
    submit_button = st.button("Submit", use_container_width=True)
    if submit_button:
        # BUG FIX: with accept_multiple_files=True the uploader returns a
        # (possibly empty) list, never None — test truthiness so the error
        # message actually shows when no file was uploaded.
        if uploaded_files:
            file_names = ", ".join([file.name for file in uploaded_files])
            st.write(f"**檔案名稱:** {file_names}")
            st.write(f"**額外資訊:** {user_input}")

            # Process the documents and display the model's summary.
            summary = process_documents(uploaded_files, user_input)

            st.write("")
            st.write("### Summary:")
            st.write(summary)
        else:
            st.error("請先上傳檔案")


# Entry point — launch the app with `streamlit run <filename>.py`.
if __name__ == "__main__":
    run_streamlit()

完整程式碼

# streamlit
import streamlit as st
# langChain
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import os
from langchain_core.document_loaders import BaseLoader
from typing import Iterator
from langchain_core.documents import Document
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# Launch command: streamlit run filename.py

# 1. Create model
# NOTE(review): hard-coding the API key in source is unsafe — prefer reading
# it from the environment or a secrets manager.
os.environ["OPENAI_API_KEY"] = "你的OpenAI API key"

# Chat model that answers questions over the retrieved document context.
model = ChatOpenAI(model="gpt-4o")

# Embedding model used to vectorize document chunks for similarity search.
embeddingmodel = OpenAIEmbeddings(model="text-embedding-ada-002")


# 2. 製作loader
class CustomDocumentLoader(BaseLoader):
    """Extension-aware document loader.

    Dispatches on the file extension: ``.csv`` files go through CSVLoader,
    ``.pdf`` files through PyPDFLoader, and anything else is read as UTF-8
    text, yielding one Document per line.
    """

    def __init__(self, file_path: str) -> None:
        # Path of the file to load lazily.
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        """Yield Documents for the file, choosing a loader by extension."""
        if self.file_path.endswith('.csv'):
            # Delegate CSV parsing (one Document per row).
            yield from CSVLoader(self.file_path).lazy_load()
        elif self.file_path.endswith('.pdf'):
            # Delegate PDF parsing (one Document per page).
            yield from PyPDFLoader(self.file_path).lazy_load()
        else:
            # Fallback: treat the file as UTF-8 text, one Document per line.
            with open(self.file_path, encoding="utf-8") as f:
                # enumerate replaces the manual line_number counter.
                for line_number, line in enumerate(f):
                    yield Document(
                        page_content=line,
                        metadata={"line_number": line_number,
                                  "source": self.file_path},
                    )


def process_documents(files, user_input):
    """Index every uploaded file in a vector store and answer a question.

    Args:
        files: iterable of Streamlit ``UploadedFile`` objects.
        user_input: the user's question / extra instructions for the LLM.

    Returns:
        The LLM's answer as a plain string.
    """
    all_docs = []

    for uploaded_file in files:
        # Persist the upload to a temp file so the path-based loaders can
        # read it.  BUG FIX: keep a real "." before the extension — the
        # previous name "temp_uploaded_file_csv" never matched
        # endswith('.csv'), so CSV/PDF files were parsed as plain text.
        suffix = uploaded_file.name.split('.')[-1]
        temp_file_path = f"temp_uploaded_file.{suffix}"
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(uploaded_file.getvalue())

        # 1. Load the file through the extension-aware custom loader.
        loader = CustomDocumentLoader(temp_file_path)
        all_docs.extend(loader.lazy_load())

        # Delete the temp file once its content is in memory.
        os.remove(temp_file_path)

    # 2. Split into overlapping chunks.  BUG FIX: split *all_docs* (every
    # uploaded file) instead of only the last iteration's `docs`.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400, chunk_overlap=100)
    splits = text_splitter.split_documents(all_docs)

    # 3. Embed the chunks and store them in a Chroma vector store.
    vectorstore = Chroma.from_documents(
        documents=splits, embedding=embeddingmodel)
    retriever = vectorstore.as_retriever()

    # 4. Build the RAG chain: retrieve -> prompt -> LLM -> string output.
    template = """
    你是一個人工智慧輔助系統,可以根據文件的內容進行簡單的總結,並且根據使用者需求來進行分析
    以下為文件的內容:
    {context}
    使用者提供的額外資訊:
    {user_input}
    """
    custom_rag_prompt = PromptTemplate.from_template(template)

    def format_docs(docs):
        # Concatenate retrieved chunks into a single context string.
        return "\n\n".join(doc.page_content for doc in docs)

    rag_chain = (
        {"context": retriever | format_docs, "user_input": lambda x: x}
        | custom_rag_prompt
        | model
        | StrOutputParser()
    )

    # 5. Invoke the chain with the user's question.
    result = rag_chain.invoke(user_input)
    return result


def run_streamlit():
    """Render the Streamlit UI: upload files, collect a prompt, show the answer."""
    st.set_page_config(
        page_title="文件分析與總結器", page_icon="📄")
    st.title("文件分析與總結器")
    st.write("")
    st.write("### Step 1: 上傳文件")
    uploaded_files = st.file_uploader(
        "Choose a file", type=["txt", "py", "csv", "pdf"],
        label_visibility="visible", accept_multiple_files=True)
    st.write("")
    st.write("### Step 2: 輸入額外資訊,讓人工智慧處理")
    user_input = st.text_input(
        "Enter Additional Information", "")
    st.write("")
    st.write("### Step 3: 提交")
    submit_button = st.button("Submit", use_container_width=True)
    if submit_button:
        # BUG FIX: with accept_multiple_files=True the uploader returns a
        # (possibly empty) list, never None — test truthiness so the error
        # message actually shows when no file was uploaded.
        if uploaded_files:
            file_names = ", ".join([file.name for file in uploaded_files])
            st.write(f"**檔案名稱:** {file_names}")
            st.write(f"**額外資訊:** {user_input}")

            # Process the documents and display the model's summary.
            summary = process_documents(uploaded_files, user_input)

            st.write("")
            st.write("### Summary:")
            st.write(summary)
        else:
            st.error("請先上傳檔案")


# Entry point — launch the app with `streamlit run <filename>.py`.
if __name__ == "__main__":
    run_streamlit()

執行命令

  • 同樣的我們將檔案儲存成day14.py,那麼執行命令
streamlit run day14.py

執行結果

  • 我們上傳day10的csv檔案,內容有中低收入戶的資料,來源於政府的公開資料平台,以及day11的OpenCV 只有畫出矩形的程式碼範例檔,並且有如下輸入
  • 幫我利用裡面的程式碼的函式庫,進行低收入戶的視覺化顯示,只要輸出程式碼即可
  • 儘管這個使用者輸入有點複雜,但我還是相信他能夠從程式碼範例檔獲取出函式庫,並製作範例
    https://ithelp.ithome.com.tw/upload/images/20240913/20168697NOWVOBxexP.png

https://ithelp.ithome.com.tw/upload/images/20240913/20168697nWtuyBDmUh.png
https://ithelp.ithome.com.tw/upload/images/20240913/20168697bVPuoR8eW3.png


上一篇
day13 chatDocument 文件分析器(一):智能CSV分析與StreamLit前端介面
下一篇
day15 繪製架構圖工具介紹:Plant UML、Diagram as Code 、Eraser
系列文
智能雲端架構圖生成:結合LangChain&LangGraph與RAG的創新應用28
圖片
  直播研討會
圖片
{{ item.channelVendor }} {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言