🎉 First commit

Nicolas Rojas 2025-05-29 21:08:20 -05:00
commit 1ca4fda4d6
Signed by: nicolas
SSH key fingerprint: SHA256:gi4v1tDcXHbV+fkvqqs9b5rkFlo4Q9DHXp90MifkZK0
10 changed files with 569 additions and 0 deletions

165
.gitignore vendored Normal file

@@ -0,0 +1,165 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Data folder
sagemaker_documentation

13
LICENSE Normal file

@@ -0,0 +1,13 @@
Copyright (c) 2024 Nicolas Rojas
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

48
README.md Normal file

@@ -0,0 +1,48 @@
# Docuchat
**By: Nicolas Rojas**
Documentation-based RAG pipeline.
This repository contains the source code to build a RAG application that lets users ask a chatbot questions about internal documentation, using LlamaIndex and Ollama with locally deployed large language models.
## Installation
Run the following command to install the dependencies:
```shell
sh setup.sh
```
This installs the required Python libraries, downloads the language model, and creates the documentation database.
## Running
The RAG backend can be run with the following command:
```shell
uvicorn backend:app --host 0.0.0.0 --port 8000 --reload
```
This serves the API on `localhost:8000`, which can then be accessed through the frontend interface.
To use the frontend web application, run `python frontend.py` and the interface will open on `localhost:7860`. You will be able to ask questions about the documents from there.
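You can also query the API directly, without the web interface. The following is a minimal sketch using Python's `requests` library; it mirrors the request that `frontend.py` sends, and the example question is hypothetical:
```python
# Minimal sketch: query the Docuchat backend directly.
# Assumes the backend is already running on localhost:8000 (see the uvicorn command above).
import requests

payload = {"query": "How do I create a SageMaker endpoint?"}  # hypothetical question
response = requests.post("http://localhost:8000/query/", json=payload, timeout=30)
response.raise_for_status()

data = response.json()
print(data["response"])      # answer generated by the local LLM
print(data["source_files"])  # documentation files used as context for the answer
```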
## Rationale
This repository addresses the significant amount of time that developers spend searching through company documentation, or asking other developers simple questions that are already answered in it. An efficient way to solve this is to build an intelligent agent with access to the company's internal documentation, so that users can ask questions in natural language, just as they would with an experienced colleague. An LLM pattern well suited to this situation is Retrieval-Augmented Generation (RAG), which produces applications that can answer questions about specific source information. In other words, it is a sophisticated question-answering (Q&A) chatbot that can reason about private data or data introduced after a model's cutoff date.
An important issue is that the information used by the RAG system is often private, with proprietary and geographical restrictions, so it is useful to design a system that does not need to send information to third parties in order to preserve data privacy. For these reasons, the tools used for this project are [Ollama](https://ollama.com) (a tool to deploy language models locally), [Chroma](https://www.trychroma.com) (a local vector database), [Hugging Face](https://huggingface.co) (a set of libraries to download open-source embedding and ML models), and [LlamaIndex](https://docs.llamaindex.ai/en/stable/) (a framework to build complete LLMOps pipelines). In particular, the models recommended for this deployment are Mixtral (a well-performing LLM with a truly open-source license) and GTE (a well-performing embedding model with state-of-the-art training methods).
This project is designed as follows (a condensed code sketch appears right after the list):
- High-level configurations can be made by the user just by editing `config.yaml` according to their preferences.
- First, the documents are loaded from source (in this case, plain text files), segmented into chunks, and vectorized using a Hugging Face model.
- Then the vectors are stored in a local database, which is Chroma in this case.
- Whenever the knowledge base is updated, the database can be updated as well with Chroma (run `python database_handler.py -u`).
- If any user needs to query the knowledge base, the backend loads both the vector database and a large language model.
- The query is passed through the LlamaIndex RAG pipeline and the results are returned through a REST API.
- The application frontend presents both the request and the response in a user-friendly interface.
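The design above maps onto a small amount of LlamaIndex code. The following is a condensed, illustrative sketch of the indexing and query flow, using the values from `config.yaml`; the full implementations live in `database_handler.py` and `backend.py`, and the example question is hypothetical:
```python
# Condensed sketch of the RAG flow; see database_handler.py and backend.py for the full code.
import chromadb
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.chroma import ChromaVectorStore

# Local models: GTE embeddings from Hugging Face, mixtral served by Ollama
Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large")
Settings.llm = Ollama(model="mixtral", request_timeout=360.0)

# 1. Load the documents, vectorize them, and store the vectors in a local Chroma collection
chroma_client = chromadb.PersistentClient(path=".rag_index")
collection = chroma_client.get_or_create_collection("documents_chroma_db")
vector_store = ChromaVectorStore(chroma_collection=collection)
documents = SimpleDirectoryReader("documents").load_data()
index = VectorStoreIndex.from_documents(
    documents, storage_context=StorageContext.from_defaults(vector_store=vector_store)
)

# 2. Answer a question with the retrieval-augmented query engine
answer = index.as_query_engine().query("What does the deployment guide say about IAM roles?")
print(answer)
```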
A simple architecture diagram looks as follows:
![Architecture diagram](architecture.svg)

4
architecture.svg Normal file

File diff suppressed because one or more lines are too long


115
backend.py Normal file

@@ -0,0 +1,115 @@
"""
RAG pipeline accessible through RestAPI.
Author
------
Nicolas Rojas
"""
# import libraries
import os.path
from json import dumps
import yaml
from pydantic import BaseModel
from fastapi import FastAPI
import chromadb
from llama_index.core import (
VectorStoreIndex,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.ollama import Ollama
from database_handler import create_index
class Query(BaseModel):
"""RestAPI input structure."""
query: str
def save_query(path: str, query: str, response: dict):
"""Save query in persistent jsonl file.
Parameters
----------
path : str
Path to the jsonl file where the query history is saved.
query : str
Query received by the RAG system.
response : dict
Dictionary with the response and relevant documents.
"""
data = dict(response)
data["query"] = query
with open(path, "a", encoding="utf8") as jfile:
jfile.write(dumps(data, ensure_ascii=False) + "\n")
# load configuration variables
with open("config.yaml", "r", encoding="utf8") as yfile:
parameters = yaml.safe_load(yfile)
index_dir = parameters["index_directory"]
chunk_size = parameters["chunk_size"]
embedding_model = parameters["embedding_model"]
ollama_model = parameters["ollama_model"]
chroma_collection = parameters["chroma_collection"]
documents_dir = parameters["documents_dir"]
query_history = parameters["query_history"]
# Set custom RAG settings
Settings.chunk_size = chunk_size
Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)
Settings.llm = Ollama(model=ollama_model, request_timeout=360.0)
# initiate FastAPI app
app = FastAPI()
# check if stored index already exists
if not os.path.exists(index_dir):
create_index(chroma_collection, documents_dir, index_dir, embedding_model)
# load the existing index
chroma_client = chromadb.PersistentClient(path=index_dir)
chroma_collection = chroma_client.get_or_create_collection(chroma_collection)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
embed_model = HuggingFaceEmbedding(model_name=embedding_model)
index = VectorStoreIndex.from_vector_store(
vector_store,
embed_model=embed_model,
)
# define the index
query_engine = index.as_query_engine()
@app.post("/query/")
def retrieve(query: Query) -> dict:
"""Run a query with the RAG pipeline.
Parameters
----------
query : str
Question asked by the user, as a string.
Returns
-------
dict
Dictionary containing the answer given by the LLM and the relevant
documents.
"""
query = query.query
global query_engine
response = query_engine.query(query)
result = {"response": response.response}
source_files = []
for source_node in response.source_nodes:
source_files.append(source_node.node.metadata["file_name"])
source_files = list(set(source_files))
result["source_files"] = source_files
save_query(query_history, query, result)
return result

7
config.yaml Normal file

@@ -0,0 +1,7 @@
index_directory: ".rag_index"
chunk_size: 512
embedding_model: "thenlper/gte-large"
ollama_model: "mixtral"
chroma_collection: "documents_chroma_db"
documents_dir: "documents"
query_history: "queries_database.jsonl"

151
database_handler.py Normal file

@@ -0,0 +1,151 @@
"""
CLI handler to create or update the Chroma database.
Author
------
Nicolas Rojas
"""
# import libraries
from argparse import ArgumentParser
import os.path
from datetime import datetime
import yaml
import chromadb
from llama_index.core import (
VectorStoreIndex,
SimpleDirectoryReader,
StorageContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
def create_index(
chroma_collection_name: str,
documents_dir: str,
index_dir: str,
embedding_model: str,
):
"""Create vector database from documents folder.
Parameters
----------
chroma_collection_name : str
Name of the Chroma collection to be created.
documents_dir : str
Directory where the documents are stored.
index_dir : str
Directory where the index is saved.
embedding_model : str
Huggingface embedding model to vectorize the documents.
"""
# create Chroma vector store
chroma_client = chromadb.PersistentClient(path=index_dir)
chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# load the documents and create the index
embed_model = HuggingFaceEmbedding(model_name=embedding_model)
documents = SimpleDirectoryReader(documents_dir).load_data()
index = VectorStoreIndex.from_documents(
documents, storage_context=storage_context, embed_model=embed_model
)
def update_index(
chroma_collection_name: str,
documents_dir: str,
index_dir: str,
embedding_model: str,
):
"""Update vector database with new or changed files.
Parameters
----------
chroma_collection_name : str
Name of the Chroma collection to be updated.
documents_dir : str
Directory where the documents are stored.
index_dir : str
Directory where the index is saved.
embedding_model : str
Huggingface embedding model to vectorize the documents.
"""
# load the existing index
chroma_client = chromadb.PersistentClient(path=index_dir)
chroma_collection = chroma_client.get_or_create_collection(chroma_collection_name)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
embed_model = HuggingFaceEmbedding(model_name=embedding_model)
index = VectorStoreIndex.from_vector_store(
vector_store,
embed_model=embed_model,
)
documents = SimpleDirectoryReader(documents_dir).load_data()
for doc in documents:
doc_id = doc.metadata["file_path"]
# Check if document already exists in the index
existing_node = index.docstore.document_exists(doc_id)
if existing_node:
existing_mtime = datetime.fromisoformat(
existing_node.metadata["last_modified"]
)
current_mtime = datetime.fromtimestamp(os.path.getmtime(doc_id))
# If the file has been modified, update it
if current_mtime > existing_mtime:
index.update_ref_doc(doc_id, doc)
else:
# If the document doesn't exist, insert it
index.insert(doc)
# Persist changes
index.storage_context.persist(persist_dir=index_dir)
if __name__ == "__main__":
parser = ArgumentParser(
description="Utility to manage database index with LlamaIndex and Chroma"
)
parser.add_argument(
"-c", "--create", action="store_true", help="Create the database"
)
parser.add_argument(
"-u", "--update", action="store_true", help="Update the database"
)
args = parser.parse_args()
# load configuration variables
with open("config.yaml", "r", encoding="utf8") as yfile:
parameters = yaml.safe_load(yfile)
index_dir = parameters["index_directory"]
chunk_size = parameters["chunk_size"]
embedding_model = parameters["embedding_model"]
chroma_collection = parameters["chroma_collection"]
documents_dir = parameters["documents_dir"]
# Set custom RAG settings
Settings.chunk_size = chunk_size
Settings.embed_model = HuggingFaceEmbedding(model_name=embedding_model)
# if both arguments are true or both are false, throw error
if args.create == args.update:
raise ValueError(
"Use the program with argument -c OR -u. Use flag -h for help."
)
# create database if doesnt exist yet
if args.create:
if os.path.exists(index_dir):
raise FileExistsError(f"The file {index_dir} already exists")
create_index(chroma_collection, documents_dir, index_dir, embedding_model)
# update existing database
elif args.update:
if not os.path.exists(index_dir):
raise FileNotFoundError(f"The file {index_dir} does not exist")
update_index(chroma_collection, documents_dir, index_dir, embedding_model)

54
frontend.py Normal file

@@ -0,0 +1,54 @@
"""
Web application to interact with the RAG system.
Author
------
Nicolas Rojas
"""
import requests
import gradio as gr
API_URL = "http://localhost:8000/query/"
def query_model(text: str) -> str:
"""Query the RAG system through its API.
Parameters
----------
text : str
Question or query made by the user.
Returns
-------
str
Response given by the API as a single string, or error logs when found.
"""
try:
# Send a POST request to the API
response = requests.post(API_URL, json={"query": text}, timeout=30)
# Check whether the request was successful
if response.status_code != 200:
return f"Error: API returned status code {response.status_code}"
result = response.json()["response"]
result += "\nThis information was obtained from the following files:\n"
for source in response.json()["source_files"]:
result += f"- {source}\n"
return result
except requests.RequestException as e:
return f"Error: Could not connect to the API. {str(e)}"
# Create the Gradio interface
interface = gr.Interface(
fn=query_model,
inputs=gr.Textbox(lines=5, placeholder="Enter text here..."),
outputs="text",
title="Docuchat",
description="Enter your question, and I'll provide with an answer from the documentation.",
)
# Launch the app
interface.launch()

8
requirements.txt Normal file

@@ -0,0 +1,8 @@
llama-index-core==0.11.8
llama-index-readers-file==0.2.1
llama-index-llms-ollama==0.3.1
llama-index-embeddings-huggingface==0.3.1
llama-index-vector-stores-chroma==0.2.0
fastapi>=0.68.0
uvicorn>=0.15.0
gradio==4.29.0

4
setup.sh Normal file

@@ -0,0 +1,4 @@
pip install -r requirements.txt
curl -fsSL https://ollama.com/install.sh | sh
python database_handler.py -c
ollama pull mixtral