-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f898c2b
commit 367e9e6
Showing
9 changed files
with
1,091 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
PATH_RAW_PDF: "data/raw/pdf/" | ||
COLLECTION_NAME: "pdf" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# -*- coding: utf-8 -*- | ||
# mypy: disable-error-code="call-arg" | ||
# TODO: Change langchain param names to match the new langchain version | ||
|
||
import logging | ||
from typing import List, Optional, Union | ||
|
||
from langchain.embeddings import CacheBackedEmbeddings | ||
from langchain_openai.embeddings import OpenAIEmbeddings | ||
|
||
class CacheBackedEmbeddingsExtended(CacheBackedEmbeddings):
    """CacheBackedEmbeddings whose ``embed_query`` also goes through the cache."""

    def embed_query(self, text: str) -> List[float]:
        """
        Embed a query string, reusing a cached vector when one exists.

        The lookup and the write-back both use ``document_embedding_store``,
        keyed by the raw query text.

        Args:
            text: The text to embed.
        Returns:
            The embedding for the given text.
        """
        store = self.document_embedding_store

        # mget returns one entry per requested key; None marks a cache miss.
        cached: Union[List[float], None] = store.mget([text])[0]
        if cached is not None:
            return cached

        # Cache miss: compute the embedding and store it for the next call.
        computed = self.underlying_embeddings.embed_query(text)
        store.mset([(text, computed)])
        return computed
|
||
|
||
def get_embedding_model(emb_model: Optional[str] = None) -> CacheBackedEmbeddings:
    """
    Build a cache-backed embedder on top of OpenAI embeddings.

    Args:
        emb_model: Embedding model type. Currently unused: an
            ``OpenAIEmbeddings`` instance with default settings is always
            created. Defaults to ``None`` so the function can be called
            without arguments.
    Returns:
        A ``CacheBackedEmbeddingsExtended`` wrapping ``OpenAIEmbeddings``,
        caching vectors in an in-process byte store.
    """
    # Local import keeps this fix independent of the module's import block.
    from langchain.storage import InMemoryByteStore

    underlying_embeddings = OpenAIEmbeddings()

    # CacheBackedEmbeddings requires a document embedding store; constructing
    # it with only the underlying embedder raises TypeError. from_bytes_store
    # wraps the byte store with the key encoder and (de)serializers.
    # TODO: replace the in-memory store with a persistent one (the original
    # plan was a Redis store via get_redis_store()).
    embedder = CacheBackedEmbeddingsExtended.from_bytes_store(
        underlying_embeddings,
        InMemoryByteStore(),
        namespace=underlying_embeddings.model,
    )
    return embedder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import sys | ||
import os | ||
|
||
# Temporary solution. It anticipates the future centralization of logs. | ||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) | ||
|
||
|
||
import yaml | ||
import os | ||
|
||
from dotenv import load_dotenv | ||
from langchain.vectorstores.pgvector import PGVector | ||
from langchain.embeddings import CacheBackedEmbeddings | ||
from backend.logging_config import logger | ||
|
||
|
||
# from helpers.embedding_models import get_embedding_model | ||
|
||
# Load variables from a local .env file into the process environment.
load_dotenv()

# Parse the ingestion settings. The context manager closes the file handle
# (a bare open() call leaves it open), and ``or {}`` covers an empty YAML
# file, for which yaml.load returns None.
# NOTE: the path is relative to the current working directory.
with open("backend/ingestion/config.yaml", encoding="utf-8") as config_file:
    ingestion_config = yaml.load(config_file, Loader=yaml.FullLoader) or {}

# Ingestion settings; None when the key is absent from the config file.
path_raw_pdf = ingestion_config.get("PATH_RAW_PDF", None)
collection_name = ingestion_config.get("COLLECTION_NAME", None)

# Database settings from the environment; each is None when unset.
db_name = os.getenv("DB_NAME")

DATABASE_HOST = os.getenv("DATABASE_HOST")
DATABASE_PORT = os.getenv("DATABASE_PORT")
DATABASE_USER = os.getenv("DATABASE_USER")
DATABASE_PASSWORD = os.getenv("DATABASE_PASSWORD")
|
||
|
||
class PDFExtractionPipeline:
    """Pipeline for extracting text from PDFs and loading them into a vector store."""

    # Vector store handle; nothing in this class assigns it yet.
    db: PGVector | None = None
    # Declared for the embedder; not assigned yet (see the commented-out
    # call in __init__).
    embedding: CacheBackedEmbeddings

    def __init__(self) -> None:
        """Build the PGVector connection string from the module-level DB settings."""
        logger.info("Initializing PDFExtractionPipeline")
        # self.embedding_model = get_embedding_model()

        self.connection_str = PGVector.connection_string_from_db_params(
            driver="psycopg2",
            host=DATABASE_HOST,
            port=DATABASE_PORT,
            database=db_name,
            user=DATABASE_USER,
            password=DATABASE_PASSWORD,
        )
        # The connection string embeds the database password, so log only
        # the non-secret parts instead of the string itself.
        logger.debug(
            f"Connection target: {DATABASE_USER}@{DATABASE_HOST}:{DATABASE_PORT}/{db_name}"
        )

    def run(self, collection_name: str) -> None:
        """
        Run the pipeline for one collection.

        Currently a placeholder that only logs the collection name.

        Args:
            collection_name: Name of the collection to process.
        """
        logger.info(f"Running extraction pipeline for collection: {collection_name}")
        # Example method to demonstrate usage; extraction is not implemented yet.
|
||
|
||
# Example usage | ||
if __name__ == "__main__":
    # Script entry point: announce the start, then build the pipeline and
    # run it on the collection named in the ingestion config.
    logger.info("Starting PDF extraction pipeline")
    PDFExtractionPipeline().run(collection_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
from loguru import logger | ||
import sys | ||
|
||
def setup_logging():
    """Replace Loguru's default handler with a DEBUG-level stderr sink."""
    # Timestamp | level | message, with Loguru colour markup.
    log_format = (
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level}</level> | "
        "<level>{message}</level>"
    )

    # Drop the default handler so records are not emitted twice.
    logger.remove()
    logger.add(sys.stderr, format=log_format, level="DEBUG")


# Configure logging as an import side effect so every importer of this
# module gets the same handler setup.
setup_logging()
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters