Open5

Momento Vector Index と LangChain、LlamaIndex を連携してドキュメントを格納する

biosugar0biosugar0
    from llama_index import download_loader
    # Fetch the loader registered under the name "ZendeskReader" via
    # llama_index's download_loader and bind it for later instantiation.
    ZendeskReader = download_loader("ZendeskReader")


loader.load_langchain_documents()
して取得したDocumentをmomento vector indexに入れる

biosugar0biosugar0

Document.metadata["id"] の値を int から str に変換しないと、以下の「SDK Failed」エラーが発生する。

momento.errors.exceptions.UnknownException: UnknownException(message='SDK Failed to process the request.', error_code=<MomentoErrorCode.UNKNOWN_ERROR: 15>, transport_details=None, message_wrapper='Unknown error has occurred')

こうしたらいけた

    # Momento failed with "SDK Failed to process the request" while the "id"
    # metadata was an int, so stringify it on every document before indexing.
    docs_payload = list(documents)
    for doc in docs_payload:
        doc.metadata["id"] = str(doc.metadata["id"])

    # Embed the documents with OpenAI and store them in the "test" index.
    embeddings = OpenAIEmbeddings()
    vector_db = MomentoVectorIndex.from_documents(
        docs_payload, embeddings, index_name="test"
    )
biosugar0biosugar0

LangChain 経由で upsert すると、同じドキュメントが重複して登録されているように見える(ids を明示的に渡していないため、実行のたびに新しい ID で追加されている可能性がある。要確認)。

biosugar0biosugar0

こんな感じ

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import MomentoVectorIndex
from llama_index import download_loader

def load_index():
    """Load Zendesk articles, split them, and store them in Momento Vector Index.

    The articles are fetched with the llama_index ``ZendeskReader`` as
    LangChain documents, split with ``RecursiveCharacterTextSplitter``
    (``chunk_size=1000``, ``chunk_overlap=0``), embedded with
    ``OpenAIEmbeddings`` and written to the ``"zendesk_smartmat"`` index.

    Returns:
        The ``MomentoVectorIndex`` vector store returned by
        ``from_documents``, so callers can reuse it; existing callers that
        ignore the return value are unaffected.
    """
    ZendeskReader = download_loader("ZendeskReader")

    loader = ZendeskReader(zendesk_subdomain="your_zendesk", locale="ja")
    # load data from Zendesk
    docs = loader.load_langchain_documents()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
    )
    documents = text_splitter.split_documents(docs)

    # The "id" metadata must be a string for Momento; with an int value the
    # upsert failed with "SDK Failed to process the request". The documents
    # are updated in place, so no separate payload list is needed.
    for d in documents:
        d.metadata["id"] = str(d.metadata["id"])

    return MomentoVectorIndex.from_documents(
        documents,
        OpenAIEmbeddings(),
        index_name="zendesk_smartmat",
    )

# Run the indexing job only when this file is executed as a script.
if __name__ == "__main__":
    load_index()

biosugar0biosugar0

Momento Python SDK v1.11.0 の更新により、metadata の id を str に変換する処理なしで、以下のコードで index を作成できるようになった。

def load_index():
    """Index Zendesk articles into Momento Vector Index.

    Fetches the articles as LangChain documents through the llama_index
    ``ZendeskReader`` (configured by ``ZENDESK_SUBDOMAIN`` and ``LOCALE``),
    splits them with ``chunk_size=1000`` and ``chunk_overlap=0``, and stores
    the OpenAI-embedded chunks in the index named by ``INDEX_NAME``.
    """
    reader_cls = download_loader("ZendeskReader")

    # Pull the articles from Zendesk as LangChain documents.
    zendesk_docs = reader_cls(
        zendesk_subdomain=ZENDESK_SUBDOMAIN,
        locale=LOCALE,
    ).load_langchain_documents()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    chunks = splitter.split_documents(zendesk_docs)

    # Embed the chunks and write them to the index.
    MomentoVectorIndex.from_documents(chunks, OpenAIEmbeddings(), index_name=INDEX_NAME)


# Run the indexing job only when this file is executed as a script.
if __name__ == "__main__":
    load_index()