Day23-把所有的產品資料用 LangChain 的規格儲存到 Qdrant

2024 iThome 鐵人賽

DAY 23

生成式 AI

生成式 AI 時代下的 Azure Machine Learning系列第 23 篇

16th鐵人賽 azure machine learning langchain cohere-embed-v3-multilingual

大魔術熊貓工程師

2024-10-07 00:43:58

367 瀏覽

分享至

今天我們就來整理產品資料，而且我們要整理成 LangChain Document 的通用格式，然後儲存到向量資料庫之中。

下面是我們整理好的產品型錄：

[
{"品名": "ResiMax RM1001", "介紹": "RM1001是電阻器，具有高精度和穩定性，非常適合精密電路設計。"},
{"品名": "CapacitorPro CP2205", "介紹": "CP2205是電容器，具有極低的等效串聯電阻（ESR），適合高頻濾波應用。"},
{"品名": "TransiTech TT500N", "介紹": "TT500N是一款晶體管，具有高增益和快速切換特性，適合開關電源設計。"},
{"品名": "InduMax IM300", "介紹": "IM300是一款電感器，具有高穩定性的自諧振頻率，適合高精度濾波應用。"},
{"品名": "DiodeX DX100S", "介紹": "DX100S是一款蕭特基二極體，具有低正向壓降和快速反向恢復，適合高效能電源設計。"},
{"品名": "InduMax IM450Q", "介紹": "IM450Q是一款電感器，具有極高的Q值，非常適合射頻濾波器和振盪器設計。"},
{"品名": "PowerGuard PG500", "介紹": "PG500是一款電源穩壓器，具有寬廣的輸入電壓範圍和高穩壓精度，適合各類電子設備。"},
{"品名": "CMOSPro CP3000", "介紹": "CP3000是一款基於CMOS技術的晶片，具有低功耗和高集成度，適合數位電路設計。"},
{"品名": "MicroBoard MB1000", "介紹": "MB1000是一款微控制器，具備多種接口和擴展功能，適合快速開發和測試各類嵌入式應用，用於嵌入式系統和原型設計。"}
]

由於 LangChain 目前還沒有整合 Azure ML serverless endpoint 上的 embedding model，我們就自己寫一個吧！我也發了 PR 給 LangChain，看看他們要不要 merge：https://github.com/langchain-ai/langchain/pull/27148
我們可以用上次 Day 11 的手法，從我的 GitHub repo 裡來抓，也可以直接在專案裡新增一個 azure_ml_embeddings.py 檔案，貼上下面的程式碼：

import json
import os
from typing import Any, List, Optional

from langchain_core.embeddings import Embeddings
from langchain_core.utils import from_env
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
import requests
import asyncio
import aiohttp

# Presumably the name of the embedding model this module is written for.
# NOTE(review): nothing in the visible code references this constant —
# confirm whether it is still needed.
DEFAULT_MODEL = "Cohere-embed-v3-multilingual"

class AzureMLEndpointEmbeddings(BaseModel, Embeddings):
    """Embeddings client for an Azure ML embedding endpoint.

    Sends texts to the configured endpoint as a JSON body of the form
    ``{"input": [...]}`` with a bearer token, and reads the vectors back from
    ``response["data"][i]["embedding"]``.

    To use, set up an Azure ML API endpoint and provide the endpoint URL and
    API key (the key may also come from the ``AZURE_ML_API_KEY`` environment
    variable).

    Example:
        .. code-block:: python

            from azure_ml_embeddings import AzureMLEndpointEmbeddings
            azure_ml = AzureMLEndpointEmbeddings(
                embed_url="https://Cohere-embed-v3-multilingual.azure.com/embeddings",
                api_key="your-api-key"
            )
    """

    client: Any = None  #: :meta private:
    async_client: Any = None  #: :meta private:
    embed_url: Optional[str] = None
    """Azure ML endpoint URL to use for embedding."""
    api_key: Optional[str] = Field(
        default_factory=from_env("AZURE_ML_API_KEY", default=None)
    )
    """API Key to use for authentication."""

    model_kwargs: Optional[dict] = None
    """Keyword arguments to pass to the embedding API."""

    request_timeout: Optional[float] = 300.0
    """Seconds to wait for the endpoint before giving up; ``None`` disables it."""

    model_config = ConfigDict(
        extra="forbid",
        protected_namespaces=(),
    )

    @model_validator(mode="after")
    def validate_environment(self) -> Self:
        """Validate that an API key and an endpoint URL are available.

        Returns:
            The validated instance.

        Raises:
            ValueError: If no API key was given or found in the environment,
                or if ``embed_url`` is missing.
        """
        # Covers an explicit ``api_key=None``, which bypasses the field's
        # default factory.
        if not self.api_key:
            self.api_key = os.getenv("AZURE_ML_API_KEY")

        if not self.api_key:
            raise ValueError("API Key must be provided or set in the environment.")

        if not self.embed_url:
            raise ValueError("Azure ML endpoint URL must be provided.")

        return self

    def _build_payload(self, texts: List[str]) -> dict:
        """Return the JSON-serializable request body for ``texts``.

        Newlines are replaced by spaces, and ``model_kwargs`` (if any) are
        merged in. ``"input"`` is written last so it cannot be overridden.
        """
        cleaned = [text.replace("\n", " ") for text in texts]
        return {**(self.model_kwargs or {}), "input": cleaned}

    def _build_headers(self) -> dict:
        """Return the HTTP headers (bearer auth + JSON content type)."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def _parse_embeddings(self, response_data: dict) -> List[List[float]]:
        """Extract one embedding per entry of ``response_data["data"]``."""
        return [item["embedding"] for item in response_data["data"]]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Call Azure ML embedding endpoint to embed documents.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input returns an
            empty list without contacting the endpoint.

        Raises:
            RuntimeError: If the endpoint answers with a non-200 status.
        """
        if not texts:
            return []

        response = requests.post(
            self.embed_url,
            headers=self._build_headers(),
            data=json.dumps(self._build_payload(texts)),
            # Without a timeout a stalled endpoint would block forever.
            timeout=self.request_timeout,
        )

        if response.status_code != 200:
            raise RuntimeError(f"Error: {response.status_code}, {response.text}")
        return self._parse_embeddings(response.json())

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Async Call to Azure ML embedding endpoint to embed documents.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input returns an
            empty list without contacting the endpoint.

        Raises:
            RuntimeError: If the endpoint answers with a non-200 status.
        """
        if not texts:
            return []

        timeout = aiohttp.ClientTimeout(total=self.request_timeout)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(
                self.embed_url,
                headers=self._build_headers(),
                json=self._build_payload(texts),
            ) as response:
                if response.status != 200:
                    response_text = await response.text()
                    raise RuntimeError(f"Error: {response.status}, {response_text}")
                return self._parse_embeddings(await response.json())

    def embed_query(self, text: str) -> List[float]:
        """Call Azure ML embedding endpoint to embed a single query text.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]

    async def aembed_query(self, text: str) -> List[float]:
        """Async Call to Azure ML embedding endpoint to embed a single query text.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return (await self.aembed_documents([text]))[0]

再來，我們在 day23.py 裡貼上下面的程式碼並執行。

from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from azure_ml_embeddings import AzureMLEndpointEmbeddings

import os

# Connection settings. Each value is read from an environment variable so
# real secrets never have to be written into this script; the literal
# fallbacks are the original placeholders and must be replaced (or the
# variables set) before the script can reach the services.
cohere_embed_api_key = os.getenv("AZURE_ML_API_KEY", "xx")
cohere_embed_url = os.getenv(
    "AZURE_ML_EMBED_URL",
    "https://Cohere-embed-v3-multilingual-yqv.westus3.models.ai.azure.com/embeddings",
)
qdrant_url = os.getenv("QDRANT_URL", "https://xx.cloud.qdrant.io:6333")
qdrant_api_key = os.getenv("QDRANT_API_KEY", "xx")

# Product catalog: one dict per product with the keys "品名" (product name)
# and "介紹" (description), built from (name, description) pairs.
products = [
    {"品名": name, "介紹": description}
    for name, description in (
        ("ResiMax RM1001", "RM1001是電阻器，具有高精度和穩定性，非常適合精密電路設計。"),
        ("CapacitorPro CP2205", "CP2205是電容器，具有極低的等效串聯電阻（ESR），適合高頻濾波應用。"),
        ("TransiTech TT500N", "TT500N是一款晶體管，具有高增益和快速切換特性，適合開關電源設計。"),
        ("InduMax IM300", "IM300是一款電感器，具有高穩定性的自諧振頻率，適合高精度濾波應用。"),
        ("DiodeX DX100S", "DX100S是一款蕭特基二極體，具有低正向壓降和快速反向恢復，適合高效能電源設計。"),
        ("InduMax IM450Q", "IM450Q是一款電感器，具有極高的Q值，非常適合射頻濾波器和振盪器設計。"),
        ("PowerGuard PG500", "PG500是一款電源穩壓器，具有寬廣的輸入電壓範圍和高穩壓精度，適合各類電子設備。"),
        ("CMOSPro CP3000", "CP3000是一款基於CMOS技術的晶片，具有低功耗和高集成度，適合數位電路設計。"),
        ("MicroBoard MB1000", "MB1000是一款微控制器，具備多種接口和擴展功能，適合快速開發和測試各類嵌入式應用，用於嵌入式系統和原型設計。"),
    )
]

# Turn every catalog entry into a LangChain Document. The text holds three
# lines: a heading built from the second whitespace-separated token of the
# product name, the full product name, and the description. Each document
# gets its own metadata dict.
docs = [
    Document(
        page_content=(
            f"電子零件 - {entry['品名'].split()[1]}\n"
            f"型號: {entry['品名']}\n"
            f"功能: {entry['介紹']}\n"
        ),
        metadata={"department": "electronic", "source": "產品型錄.pdf"},
    )
    for entry in products
]



# Embedding client pointed at the endpoint and key configured above.
embeddings_model = AzureMLEndpointEmbeddings(
    api_key=cohere_embed_api_key,
    embed_url=cohere_embed_url,
)

# Embed all documents and store them in the "ironman2024" collection.
# NOTE(review): force_recreate=True presumably rebuilds an existing
# collection of the same name — confirm against the Qdrant integration docs.
qdrant = Qdrant.from_documents(
    documents=docs,
    embedding=embeddings_model,
    collection_name="ironman2024",
    force_recreate=True,
    url=qdrant_url,
    api_key=qdrant_api_key,
)