Skip to content

Commit

Permalink
UPDATE
Browse files Browse the repository at this point in the history
  • Loading branch information
mazzasaverio committed Mar 29, 2024
1 parent f898c2b commit 367e9e6
Show file tree
Hide file tree
Showing 9 changed files with 1,091 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
frontend/*
db_docker
*.tfvars*

*terraform.tfstate*
note_varie
custom_tree_and_files_corrected.txt
chapter-journal
Expand Down
Empty file added backend/__init__.py
Empty file.
5 changes: 3 additions & 2 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic_settings import BaseSettings
from typing import List
from loguru import logger
from backend.logging_config import logger

import sys


Expand Down
2 changes: 2 additions & 0 deletions backend/ingestion/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
PATH_RAW_PDF: "data/raw/pdf/"
COLLECTION_NAME: "pdf"
46 changes: 46 additions & 0 deletions backend/ingestion/helpers/embedding_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
# mypy: disable-error-code="call-arg"
# TODO: Change langchain param names to match the new langchain version

import logging
from typing import List, Optional, Union

from langchain.embeddings import CacheBackedEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings

class CacheBackedEmbeddingsExtended(CacheBackedEmbeddings):
    """CacheBackedEmbeddings variant that also caches query embeddings.

    The stock CacheBackedEmbeddings only caches document embeddings; this
    subclass routes ``embed_query`` through the same embedding store.
    """

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding for *text*, computing and caching it on a miss.

        Args:
            text: The text to embed.

        Returns:
            The embedding for the given text.
        """
        # mget returns one entry per key; a miss comes back as None.
        cached: Union[List[float], None] = self.document_embedding_store.mget([text])[0]
        if cached is not None:
            return cached

        # Cache miss: delegate to the wrapped model, then persist the result
        # so the next identical query is served from the store.
        embedding = self.underlying_embeddings.embed_query(text)
        self.document_embedding_store.mset([(text, embedding)])
        return embedding


def get_embedding_model(emb_model: Optional[str]) -> CacheBackedEmbeddings:
    """Build a cache-backed embedding model.

    Args:
        emb_model: Embedding model identifier. Currently unused — OpenAI's
            default model is always returned. TODO: dispatch on this value.

    Returns:
        A CacheBackedEmbeddingsExtended wrapping OpenAIEmbeddings with an
        in-memory byte store, so repeated texts are embedded only once.
    """
    # Local import: only needed here, keeps module import side-effect free.
    from langchain.storage import InMemoryByteStore

    underlying_embeddings = OpenAIEmbeddings()

    # BUG FIX: CacheBackedEmbeddings requires a document_embedding_store;
    # the previous one-argument construction raised TypeError when called.
    # from_bytes_store is the supported constructor — it wraps the raw byte
    # store with the vector (de)serialization layer and namespaces entries
    # by model name so different models never share cache entries.
    store = InMemoryByteStore()
    embedder = CacheBackedEmbeddingsExtended.from_bytes_store(
        underlying_embeddings, store, namespace=underlying_embeddings.model
    )

    # NOTE(review): swap InMemoryByteStore for a Redis-backed store (see the
    # original get_redis_store sketch) when cross-process caching is needed.
    return embedder
65 changes: 65 additions & 0 deletions backend/ingestion/pdf_ingestion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import sys
import os

# Temporary workaround: extend sys.path so shared modules (e.g. the central
# logging config) can be imported; proper packaging should replace this later.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))


import yaml
import os

from dotenv import load_dotenv
from langchain.vectorstores.pgvector import PGVector
from langchain.embeddings import CacheBackedEmbeddings
from backend.logging_config import logger


# from helpers.embedding_models import get_embedding_model

# Pull DB credentials and settings from .env into the process environment.
load_dotenv()

# Load ingestion settings. safe_load suffices for plain scalar config and
# avoids FullLoader's arbitrary-object construction; the context manager
# guarantees the file handle is closed (the old yaml.load(open(...)) leaked it).
with open("backend/ingestion/config.yaml") as _config_file:
    ingestion_config = yaml.safe_load(_config_file)

# Raw-PDF input directory and the target vector-store collection name.
path_raw_pdf = ingestion_config.get("PATH_RAW_PDF", None)
collection_name = ingestion_config.get("COLLECTION_NAME", None)

# Database connection parameters, sourced from the environment.
db_name = os.getenv("DB_NAME")
DATABASE_HOST = os.getenv("DATABASE_HOST")
DATABASE_PORT = os.getenv("DATABASE_PORT")
DATABASE_USER = os.getenv("DATABASE_USER")
DATABASE_PASSWORD = os.getenv("DATABASE_PASSWORD")


class PDFExtractionPipeline:
    """Pipeline for extracting text from PDFs and loading them into a vector store."""

    db: PGVector | None = None
    embedding: CacheBackedEmbeddings

    def __init__(self):
        """Build the PGVector connection string from module-level env settings."""
        logger.info("Initializing PDFExtractionPipeline")
        # self.embedding_model = get_embedding_model()

        self.connection_str = PGVector.connection_string_from_db_params(
            driver="psycopg2",
            host=DATABASE_HOST,
            port=DATABASE_PORT,
            database=db_name,
            user=DATABASE_USER,
            password=DATABASE_PASSWORD,
        )
        # SECURITY FIX: never log the raw connection string — it embeds
        # DATABASE_PASSWORD. Log a redacted form instead.
        redacted = (
            self.connection_str.replace(DATABASE_PASSWORD, "***")
            if DATABASE_PASSWORD
            else self.connection_str
        )
        logger.debug(f"Connection string: {redacted}")

    def run(self, collection_name: str):
        """Run the extraction pipeline for *collection_name*.

        Currently a stub: it only logs the invocation; the actual PDF
        extraction and vector-store loading are not implemented yet.
        """
        logger.info(f"Running extraction pipeline for collection: {collection_name}")
        # Example method to demonstrate usage
        pass


# Script entry point: run the pipeline against the configured collection.
if __name__ == "__main__":
    logger.info("Starting PDF extraction pipeline")
    extraction_pipeline = PDFExtractionPipeline()
    extraction_pipeline.run(collection_name)
10 changes: 10 additions & 0 deletions backend/logging_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from loguru import logger
import sys

# Loguru sink format: timestamp | level | message, with color markup.
_LOG_FORMAT = (
    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
    "<level>{level}</level> | "
    "<level>{message}</level>"
)


def setup_logging():
    """Replace Loguru's default handler with a DEBUG-level stderr sink."""
    logger.remove()
    logger.add(sys.stderr, format=_LOG_FORMAT, level="DEBUG")


# Configure logging as a side effect of importing this module, so every
# importer gets the shared handler without extra setup calls.
setup_logging()
963 changes: 962 additions & 1 deletion backend/poetry.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ loguru = "^0.7.2"
pydantic-settings = "^2.2.1"
asyncpg = "^0.29.0"
sqlmodel = "^0.0.16"
pyyaml = "^6.0.1"
langchain = "^0.1.13"


[build-system]
Expand Down

0 comments on commit 367e9e6

Please sign in to comment.