🎉 First commit
This commit is contained in:
commit
1ca4fda4d6
10 changed files with 569 additions and 0 deletions
151
database_handler.py
Normal file
151
database_handler.py
Normal file
|
@ -0,0 +1,151 @@
|
|||
"""
|
||||
CLI handler to create or update the Chroma database.
|
||||
|
||||
Author
|
||||
------
|
||||
Nicolas Rojas
|
||||
"""
|
||||
|
||||
# import libraries
|
||||
from argparse import ArgumentParser
|
||||
import os.path
|
||||
from datetime import datetime
|
||||
import yaml
|
||||
import chromadb
|
||||
from llama_index.core import (
|
||||
VectorStoreIndex,
|
||||
SimpleDirectoryReader,
|
||||
StorageContext,
|
||||
Settings,
|
||||
)
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.vector_stores.chroma import ChromaVectorStore
|
||||
|
||||
|
||||
def create_index(
|
||||
chroma_collection_name: str,
|
||||
documents_dir: str,
|
||||
index_dir: str,
|
||||
embedding_model: str,
|
||||
):
|
||||
"""Create vector database from documents folder.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
chroma_collection_name : str
|
||||
Name of the Chroma collection to be created.
|
||||
documents_dir : str
|
||||
Directory where the documents are stored.
|
||||
index_dir : str
|
||||
Directory where the index is saved.
|
||||
embedding_model : str
|
||||
Huggingface embedding model to vectorize the documents.
|
||||
"""
|
||||
# create Chroma vector store
|
||||
chroma_client = chromadb.PersistentClient(path=index_dir)
|
||||
chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
|
||||
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
|
||||
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
||||
|
||||
# load the documents and create the index
|
||||
embed_model = HuggingFaceEmbedding(model_name=embedding_model)
|
||||
documents = SimpleDirectoryReader(documents_dir).load_data()
|
||||
index = VectorStoreIndex.from_documents(
|
||||
documents, storage_context=storage_context, embed_model=embed_model
|
||||
)
|
||||
|
||||
|
||||
def update_index(
|
||||
chroma_collection_name: str,
|
||||
documents_dir: str,
|
||||
index_dir: str,
|
||||
embedding_model: str,
|
||||
):
|
||||
"""Update vector database with new or changed files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
chroma_collection_name : str
|
||||
Name of the Chroma collection to be updated.
|
||||
documents_dir : str
|
||||
Directory where the documents are stored.
|
||||
index_dir : str
|
||||
Directory where the index is saved.
|
||||
embedding_model : str
|
||||
Huggingface embedding model to vectorize the documents.
|
||||
"""
|
||||
# load the existing index
|
||||
chroma_client = chromadb.PersistentClient(path=index_dir)
|
||||
chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
|
||||
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
|
||||
embed_model = HuggingFaceEmbedding(model_name=embedding_model)
|
||||
index = VectorStoreIndex.from_vector_store(
|
||||
vector_store,
|
||||
embed_model=embed_model,
|
||||
)
|
||||
documents = SimpleDirectoryReader(documents_dir).load_data()
|
||||
|
||||
for doc in documents:
|
||||
doc_id = doc.metadata["file_path"]
|
||||
|
||||
# Check if document already exists in the index
|
||||
existing_node = index.docstore.document_exists(doc_id)
|
||||
|
||||
if existing_node:
|
||||
existing_mtime = datetime.fromisoformat(
|
||||
existing_node.metadata["last_modified"]
|
||||
)
|
||||
current_mtime = datetime.fromtimestamp(os.path.getmtime(doc_id))
|
||||
|
||||
# If the file has been modified, update it
|
||||
if current_mtime > existing_mtime:
|
||||
index.update_ref_doc(doc_id, doc)
|
||||
else:
|
||||
# If the document doesn't exist, insert it
|
||||
index.insert(doc)
|
||||
# Persist changes
|
||||
index.storage_context.persist(persist_dir=index_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = ArgumentParser(
|
||||
description="Utility to manage database index with LlamaIndex and Chroma"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c", "--create", action="store_true", help="Create the database"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-u", "--update", action="store_true", help="Update the database"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
# load configuration variables
|
||||
with open("config.yaml", "r", encoding="utf8") as yfile:
|
||||
parameters = yaml.safe_load(yfile)
|
||||
|
||||
index_dir = parameters["index_directory"]
|
||||
chunk_size = parameters["chunk_size"]
|
||||
embedding_model = parameters["embedding_model"]
|
||||
chroma_collection = parameters["chroma_collection"]
|
||||
documents_dir = parameters["documents_dir"]
|
||||
|
||||
# Set custom RAG settings
|
||||
Settings.chunk_size = chunk_size
|
||||
Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)
|
||||
|
||||
# if both arguments are true or both are false, throw error
|
||||
if args.create == args.update:
|
||||
raise ValueError(
|
||||
"Use the program with argument -c OR -u. Use flag -h for help."
|
||||
)
|
||||
|
||||
# create database if doesnt exist yet
|
||||
if args.create:
|
||||
if os.path.exists(index_dir):
|
||||
raise FileExistsError(f"The file {index_dir} already exists")
|
||||
create_index(chroma_collection, documents_dir, index_dir, embedding_model)
|
||||
|
||||
# update existing database
|
||||
elif args.update:
|
||||
if not os.path.exists(index_dir):
|
||||
raise FileNotFoundError(f"The file {index_dir} does not exist")
|
||||
update_index(chroma_collection, documents_dir, index_dir, embedding_model)
|
Loading…
Add table
Add a link
Reference in a new issue