docuchat/database_handler.py
2025-05-29 21:08:20 -05:00

151 lines
4.9 KiB
Python

"""
CLI handler to create or update the Chroma database.
Author
------
Nicolas Rojas
"""
# import libraries
from argparse import ArgumentParser
import os.path
from datetime import datetime
import yaml
import chromadb
from llama_index.core import (
VectorStoreIndex,
SimpleDirectoryReader,
StorageContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
def create_index(
    chroma_collection_name: str,
    documents_dir: str,
    index_dir: str,
    embedding_model: str,
):
    """Create vector database from documents folder.

    Opens (or creates) a persistent Chroma collection under ``index_dir``,
    loads every document found in ``documents_dir`` and embeds them into the
    collection through a ``VectorStoreIndex``.

    Parameters
    ----------
    chroma_collection_name : str
        Name of the Chroma collection to be created.
    documents_dir : str
        Directory where the documents are stored.
    index_dir : str
        Directory where the index is saved.
    embedding_model : str
        Huggingface embedding model to vectorize the documents.

    Returns
    -------
    VectorStoreIndex
        The index built over the loaded documents. Callers that only need
        the on-disk side effect can ignore it.
    """
    # create Chroma vector store
    chroma_client = chromadb.PersistentClient(path=index_dir)
    chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # load the documents and create the index
    embed_model = HuggingFaceEmbedding(model_name=embedding_model)
    documents = SimpleDirectoryReader(documents_dir).load_data()

    # Return the index instead of binding it to an unused local, so the
    # freshly built index can be reused without reopening the store.
    return VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, embed_model=embed_model
    )
# Metadata keys used to decide whether a file changed since it was indexed.
# NOTE(review): these are the keys SimpleDirectoryReader's default file
# metadata is expected to provide -- confirm against the installed version.
_CHANGE_MARKERS = ("last_modified_date", "file_size")


def _file_changed(stored_metadatas, current_metadata) -> bool:
    """Return True when any stored chunk disagrees with the file's current metadata."""
    for stored in stored_metadatas:
        stored = stored or {}
        for key in _CHANGE_MARKERS:
            if stored.get(key) != current_metadata.get(key):
                return True
    return False


def update_index(
    chroma_collection_name: str,
    documents_dir: str,
    index_dir: str,
    embedding_model: str,
):
    """Update vector database with new or changed files.

    Each file under ``documents_dir`` is looked up in the Chroma collection by
    its ``file_path`` metadata. Files with no stored chunks are inserted.
    Files whose stored change markers (modification date, size) differ from
    the freshly loaded ones have their old chunks deleted and are re-inserted.
    Unchanged files are skipped.

    The lookup goes through the Chroma collection rather than
    ``index.docstore``: ``document_exists`` only returns a boolean, and the
    docstore of an index rebuilt from a vector store is not expected to hold
    the previously indexed documents.

    Limitation: change detection is only as precise as the reader's metadata,
    so an edit that keeps both markers identical is not detected.

    Parameters
    ----------
    chroma_collection_name : str
        Name of the Chroma collection to be updated.
    documents_dir : str
        Directory where the documents are stored.
    index_dir : str
        Directory where the index is saved.
    embedding_model : str
        Huggingface embedding model to vectorize the documents.
    """
    # load the existing index
    chroma_client = chromadb.PersistentClient(path=index_dir)
    chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    embed_model = HuggingFaceEmbedding(model_name=embedding_model)
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )

    # A single file may load as several documents (e.g. one per page) that
    # share the same file_path, so handle them together as one unit.
    documents_by_file = {}
    for doc in SimpleDirectoryReader(documents_dir).load_data():
        documents_by_file.setdefault(doc.metadata["file_path"], []).append(doc)

    for file_path, file_documents in documents_by_file.items():
        # Fetch only the metadata of the chunks already stored for this file
        stored = chroma_collection.get(
            where={"file_path": file_path}, include=["metadatas"]
        )
        if stored["ids"]:
            stored_metadatas = stored.get("metadatas") or []
            if not _file_changed(stored_metadatas, file_documents[0].metadata):
                # Already indexed and unchanged: nothing to do
                continue
            # Drop the stale chunks before re-inserting, otherwise the old
            # and new versions of the file would both be retrievable.
            chroma_collection.delete(where={"file_path": file_path})
        for doc in file_documents:
            index.insert(doc)

    # Persist changes (kept from the original implementation)
    index.storage_context.persist(persist_dir=index_dir)
def main():
    """Parse the command line and create or update the database.

    Reads ``index_directory``, ``chunk_size``, ``embedding_model``,
    ``chroma_collection`` and ``documents_dir`` from ``config.yaml``.

    Raises
    ------
    ValueError
        If neither or both of ``-c`` and ``-u`` are given.
    FileExistsError
        If ``-c`` is given and the index directory already exists.
    FileNotFoundError
        If ``-u`` is given and the index directory does not exist.
    """
    parser = ArgumentParser(
        description="Utility to manage database index with LlamaIndex and Chroma"
    )
    parser.add_argument(
        "-c", "--create", action="store_true", help="Create the database"
    )
    parser.add_argument(
        "-u", "--update", action="store_true", help="Update the database"
    )
    args = parser.parse_args()

    # Validate the flags before any file or model loading so that a usage
    # mistake fails immediately.
    # if both arguments are true or both are false, throw error
    if args.create == args.update:
        raise ValueError(
            "Use the program with argument -c OR -u. Use flag -h for help."
        )

    # load configuration variables
    with open("config.yaml", "r", encoding="utf8") as yfile:
        parameters = yaml.safe_load(yfile)
    index_dir = parameters["index_directory"]
    chunk_size = parameters["chunk_size"]
    embedding_model = parameters["embedding_model"]
    chroma_collection = parameters["chroma_collection"]
    documents_dir = parameters["documents_dir"]

    # Check the index location before instantiating the embedding model,
    # which is the expensive step.
    index_exists = os.path.exists(index_dir)
    if args.create and index_exists:
        raise FileExistsError(f"The file {index_dir} already exists")
    if args.update and not index_exists:
        raise FileNotFoundError(f"The file {index_dir} does not exist")

    # Set custom RAG settings
    Settings.chunk_size = chunk_size
    Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)

    if args.create:
        # create database if doesnt exist yet
        create_index(chroma_collection, documents_dir, index_dir, embedding_model)
    else:
        # update existing database
        update_index(chroma_collection, documents_dir, index_dir, embedding_model)


if __name__ == "__main__":
    main()