UPDATE

mazzasaverio · Mar 29, 2024 · 367e9e6 · 367e9e6
1 parent f898c2b
commit 367e9e6
Show file tree

Hide file tree

Showing 9 changed files with 1,091 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,7 +3,7 @@
 frontend/*
 db_docker
 *.tfvars*
-
+*terraform.tfstate*
 note_varie
 custom_tree_and_files_corrected.txt
 chapter-journal

diff --git a/backend/__init__.py b/backend/__init__.py
diff --git a/backend/app/core/config.py b/backend/app/core/config.py
@@ -1,6 +1,7 @@
-from pydantic_settings import BaseSettings, SettingsConfigDict
+from pydantic_settings import BaseSettings
 from typing import List
-from loguru import logger
+from backend.logging_config import logger
+
 import sys
 
 

diff --git a/backend/ingestion/config.yaml b/backend/ingestion/config.yaml
@@ -0,0 +1,2 @@
+PATH_RAW_PDF: "data/raw/pdf/"
+COLLECTION_NAME: "pdf"
diff --git a/backend/ingestion/helpers/embedding_models.py b/backend/ingestion/helpers/embedding_models.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+# mypy: disable-error-code="call-arg"
+# TODO: Change langchain param names to match the new langchain version
+
+import logging
+from typing import List, Optional, Union
+
+from langchain.embeddings import CacheBackedEmbeddings
+from langchain.storage import InMemoryByteStore
+from langchain_openai.embeddings import OpenAIEmbeddings
+
+
+class CacheBackedEmbeddingsExtended(CacheBackedEmbeddings):
+    """CacheBackedEmbeddings variant whose ``embed_query`` also uses the cache."""
+
+    def embed_query(self, text: str) -> List[float]:
+        """
+        Embed query text, reusing a cached vector when one exists.
+
+        The query is looked up in ``document_embedding_store``; on a miss the
+        underlying model is called and the result is written back to the store.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            The embedding for the given text.
+        """
+        vectors: List[Union[List[float], None]] = self.document_embedding_store.mget([text])
+        text_embeddings = vectors[0]
+
+        if text_embeddings is None:
+            text_embeddings = self.underlying_embeddings.embed_query(text)
+            self.document_embedding_store.mset([(text, text_embeddings)])
+
+        return text_embeddings
+
+
+def get_embedding_model(emb_model: Optional[str] = None) -> CacheBackedEmbeddings:
+    """
+    Build a cache-backed OpenAI embedding model.
+
+    Args:
+        emb_model: Name of the OpenAI embedding model to use. When ``None``,
+            ``OpenAIEmbeddings`` is created with its own default model.
+
+    Returns:
+        A ``CacheBackedEmbeddingsExtended`` wrapping the OpenAI embedder.
+    """
+    if emb_model is None:
+        underlying_embeddings = OpenAIEmbeddings()
+    else:
+        underlying_embeddings = OpenAIEmbeddings(model=emb_model)
+
+    # CacheBackedEmbeddings needs a backing store; calling the constructor with
+    # only the embedder raises TypeError. An in-process byte store is used for
+    # now. TODO: swap in a persistent store (e.g. Redis) to share the cache.
+    return CacheBackedEmbeddingsExtended.from_bytes_store(
+        underlying_embeddings,
+        InMemoryByteStore(),
+        namespace=underlying_embeddings.model,
+    )
diff --git a/backend/ingestion/pdf_ingestion.py b/backend/ingestion/pdf_ingestion.py
@@ -0,0 +1,72 @@
+import os
+import sys
+
+# Temporary workaround: put the repository root on sys.path so that the shared
+# ``backend.logging_config`` module can be imported when this file is run as a
+# script. To be removed once logging is centralized.
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+
+import yaml
+from dotenv import load_dotenv
+from langchain.embeddings import CacheBackedEmbeddings
+from langchain.vectorstores.pgvector import PGVector
+
+from backend.logging_config import logger
+
+# from helpers.embedding_models import get_embedding_model
+
+load_dotenv()
+
+# config.yaml sits next to this module; resolving it from __file__ keeps the
+# script runnable from any working directory, not only the repository root.
+_CONFIG_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml")
+
+# safe_load is sufficient for a plain key/value file, and the context manager
+# closes the file handle. ``or {}`` covers an empty YAML document.
+with open(_CONFIG_PATH, encoding="utf-8") as config_file:
+    ingestion_config = yaml.safe_load(config_file) or {}
+
+path_raw_pdf = ingestion_config.get("PATH_RAW_PDF", None)
+collection_name = ingestion_config.get("COLLECTION_NAME", None)
+db_name = os.getenv("DB_NAME")
+
+DATABASE_HOST = os.getenv("DATABASE_HOST")
+DATABASE_PORT = os.getenv("DATABASE_PORT")
+DATABASE_USER = os.getenv("DATABASE_USER")
+DATABASE_PASSWORD = os.getenv("DATABASE_PASSWORD")
+
+
+class PDFExtractionPipeline:
+    """Pipeline for extracting text from PDFs and loading them into a vector store."""
+
+    db: PGVector | None = None
+    embedding: CacheBackedEmbeddings
+
+    def __init__(self):
+        """Build the PGVector connection string from the DATABASE_* and DB_NAME env values."""
+        logger.info("Initializing PDFExtractionPipeline")
+        # self.embedding_model = get_embedding_model()
+
+        self.connection_str = PGVector.connection_string_from_db_params(
+            driver="psycopg2",
+            host=DATABASE_HOST,
+            port=DATABASE_PORT,
+            database=db_name,
+            user=DATABASE_USER,
+            password=DATABASE_PASSWORD,
+        )
+        # Never log the full connection string: it is built from the password.
+        logger.debug(
+            f"Database target: {DATABASE_USER}@{DATABASE_HOST}:{DATABASE_PORT}/{db_name}"
+        )
+
+    def run(self, collection_name: str):
+        """Run the pipeline for ``collection_name`` (currently it only logs the call)."""
+        logger.info(f"Running extraction pipeline for collection: {collection_name}")
+
+
+# Example usage
+if __name__ == "__main__":
+    logger.info("Starting PDF extraction pipeline")
+    pipeline = PDFExtractionPipeline()
+    pipeline.run(collection_name)
diff --git a/backend/logging_config.py b/backend/logging_config.py
@@ -0,0 +1,24 @@
+import sys
+
+from loguru import logger
+
+LOG_FORMAT = (
+    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
+    "<level>{level}</level> | "
+    "<level>{message}</level>"
+)
+
+
+def setup_logging(level: str = "DEBUG") -> None:
+    """Route Loguru output to stderr using LOG_FORMAT.
+
+    Args:
+        level: Minimum level emitted by the stderr sink; defaults to "DEBUG".
+    """
+    logger.remove()  # Drop the handlers registered so far, including the default one
+    logger.add(sys.stderr, format=LOG_FORMAT, level=level)
+
+
+# Configure logging on import so that modules importing ``logger`` from here
+# share the same handler without having to call setup_logging() themselves.
+setup_logging()
diff --git a/backend/poetry.lock b/backend/poetry.lock
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
@@ -15,6 +15,8 @@ loguru = "^0.7.2"
 pydantic-settings = "^2.2.1"
 asyncpg = "^0.29.0"
 sqlmodel = "^0.0.16"
+pyyaml = "^6.0.1"
+langchain = "^0.1.13"
 
 
 [build-system]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		PATH_RAW_PDF: "data/raw/pdf/"
		COLLECTION_NAME: "pdf"