151 lines
4.9 KiB
Python
151 lines
4.9 KiB
Python
"""
|
|
CLI handler to create or update the Chroma database.
|
|
|
|
Author
|
|
------
|
|
Nicolas Rojas
|
|
"""
|
|
|
|
# import libraries
|
|
from argparse import ArgumentParser
|
|
import os.path
|
|
from datetime import datetime
|
|
import yaml
|
|
import chromadb
|
|
from llama_index.core import (
|
|
VectorStoreIndex,
|
|
SimpleDirectoryReader,
|
|
StorageContext,
|
|
Settings,
|
|
)
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
from llama_index.vector_stores.chroma import ChromaVectorStore
|
|
|
|
|
|
def create_index(
    chroma_collection_name: str,
    documents_dir: str,
    index_dir: str,
    embedding_model: str,
):
    """Create vector database from documents folder.

    Parameters
    ----------
    chroma_collection_name : str
        Name of the Chroma collection to be created.
    documents_dir : str
        Directory where the documents are stored.
    index_dir : str
        Directory where the index is saved.
    embedding_model : str
        Huggingface embedding model to vectorize the documents.

    Returns
    -------
    VectorStoreIndex
        Index built from the loaded documents on top of the Chroma
        collection. Callers that only need the on-disk database may
        ignore it.
    """
    # create Chroma vector store
    chroma_client = chromadb.PersistentClient(path=index_dir)
    chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # load the documents and create the index
    embed_model = HuggingFaceEmbedding(model_name=embedding_model)
    documents = SimpleDirectoryReader(documents_dir).load_data()
    # Return the index instead of dropping it in an unused local, so that
    # programmatic callers can query it without reopening the collection.
    return VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, embed_model=embed_model
    )
|
|
|
|
|
|
def _indexed_mtime(metadatas):
    """Return the oldest ``last_modified`` stamp among stored chunks.

    Parameters
    ----------
    metadatas : list
        Metadata dictionaries of the chunks already stored for one file.

    Returns
    -------
    datetime or None
        Oldest parsed timestamp, or None when there are no chunks or any
        chunk lacks a readable stamp (the file is then treated as outdated).
    """
    stamps = []
    for metadata in metadatas or []:
        try:
            stamps.append(datetime.fromisoformat((metadata or {})["last_modified"]))
        except (KeyError, TypeError, ValueError):
            return None
    return min(stamps, default=None)


def update_index(
    chroma_collection_name: str,
    documents_dir: str,
    index_dir: str,
    embedding_model: str,
):
    """Update vector database with new or changed files.

    Files that are not indexed yet are inserted. Files whose modification
    time is newer than the ``last_modified`` stamp stored with their chunks
    have those chunks deleted and are inserted again. Chunks of files that
    were removed from ``documents_dir`` are left untouched.

    Parameters
    ----------
    chroma_collection_name : str
        Name of the Chroma collection to be updated.
    documents_dir : str
        Directory where the documents are stored.
    index_dir : str
        Directory where the index is saved.
    embedding_model : str
        Huggingface embedding model to vectorize the documents.

    Notes
    -----
    Chunks that carry no ``last_modified`` stamp (for example because they
    were indexed before this function stamped them) are re-indexed once.
    """
    # load the existing index
    chroma_client = chromadb.PersistentClient(path=index_dir)
    chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    embed_model = HuggingFaceEmbedding(model_name=embedding_model)
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )
    documents = SimpleDirectoryReader(documents_dir).load_data()

    # One file can be loaded as several documents; group them so that every
    # file is checked and replaced as a unit.
    documents_by_file = {}
    for doc in documents:
        documents_by_file.setdefault(doc.metadata["file_path"], []).append(doc)

    for file_path, file_documents in documents_by_file.items():
        current_mtime = datetime.fromtimestamp(os.path.getmtime(file_path))

        # Look the file up in the collection itself rather than in the
        # index docstore.
        # NOTE(review): relies on ChromaVectorStore saving node metadata
        # (including "file_path") with each chunk -- confirm against the
        # installed llama-index version.
        stored = chroma_collection.get(
            where={"file_path": file_path}, include=["metadatas"]
        )
        stored_ids = stored["ids"]

        if stored_ids:
            indexed_mtime = _indexed_mtime(stored["metadatas"])
            # Skip files that have not been modified since they were indexed
            if indexed_mtime is not None and current_mtime <= indexed_mtime:
                continue
            # Drop the outdated chunks so the re-insert does not duplicate them
            chroma_collection.delete(ids=stored_ids)

        for doc in file_documents:
            # Stamp the exact modification time for the next update run and
            # keep it out of the text that is embedded or sent to the LLM.
            doc.metadata["last_modified"] = current_mtime.isoformat()
            for excluded_keys in (
                doc.excluded_embed_metadata_keys,
                doc.excluded_llm_metadata_keys,
            ):
                if "last_modified" not in excluded_keys:
                    excluded_keys.append("last_modified")
            index.insert(doc)

    # Persist changes
    index.storage_context.persist(persist_dir=index_dir)
|
|
|
|
|
|
if __name__ == "__main__":
    parser = ArgumentParser(
        description="Utility to manage database index with LlamaIndex and Chroma"
    )
    # Exactly one of -c / -u must be given. Letting argparse enforce this
    # rejects a bad invocation with a usage message before the configuration
    # is read or the embedding model is loaded.
    mode = parser.add_mutually_exclusive_group(required=True)
    mode.add_argument(
        "-c", "--create", action="store_true", help="Create the database"
    )
    mode.add_argument(
        "-u", "--update", action="store_true", help="Update the database"
    )
    args = parser.parse_args()

    # load configuration variables
    with open("config.yaml", "r", encoding="utf8") as yfile:
        parameters = yaml.safe_load(yfile)

    index_dir = parameters["index_directory"]
    chunk_size = parameters["chunk_size"]
    embedding_model = parameters["embedding_model"]
    chroma_collection = parameters["chroma_collection"]
    documents_dir = parameters["documents_dir"]

    # Check the state of the index directory before instantiating the
    # embedding model, so an invalid request fails without that cost.
    if args.create and os.path.exists(index_dir):
        raise FileExistsError(f"The file {index_dir} already exists")
    if args.update and not os.path.exists(index_dir):
        raise FileNotFoundError(f"The file {index_dir} does not exist")

    # Set custom RAG settings
    Settings.chunk_size = chunk_size
    Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)

    if args.create:
        # create database if doesnt exist yet
        create_index(chroma_collection, documents_dir, index_dir, embedding_model)
    else:
        # update existing database
        update_index(chroma_collection, documents_dir, index_dir, embedding_model)
|