feat: started on preprocessing, fetch pdb util, loggers
trevorpfiz committed Oct 15, 2024
1 parent aeccb72 commit 9fcf79d
Showing 6 changed files with 164 additions and 12 deletions.
1 change: 1 addition & 0 deletions apps/fastapi/pyproject.toml
@@ -12,6 +12,7 @@ dependencies = [
"aiofiles>=24.1.0",
"boto3>=1.35.29",
"fastapi[standard]>=0.115.0",
"httpx>=0.27.2",
"huggingface-hub>=0.25.1",
"mangum>=0.19.0",
"openai>=1.50.2",
52 changes: 47 additions & 5 deletions apps/fastapi/src/app/core/utils.py
Expand Up @@ -2,13 +2,17 @@
import logging
import re
from io import StringIO
-from typing import List, Type, TypeVar
+from typing import List, Optional, Type, TypeVar

import boto3
+import httpx
from fastapi import HTTPException
from pydantic import BaseModel, ValidationError

from app.core.config import settings

+logger = logging.getLogger(__name__)

# Create a TypeVar to represent the schema type
T = TypeVar("T", bound=BaseModel)

@@ -41,13 +45,13 @@ def read_s3_csv(
                    validated_row = schema.model_validate(row)
                    rows.append(validated_row)
                except (ValidationError, ValueError) as e:
-                    logging.error(f"Validation error in row {row}: {e}")
+                    logger.error(f"Validation error in row {row}: {e}")
                    continue
            else:
                rows.append(row)

    except Exception as e:
-        logging.error(f"Error reading CSV from S3: {bucket}/{s3_key} {e}")
+        logger.error(f"Error reading CSV from S3: {bucket}/{s3_key} {e}")
        return []

    return rows
@@ -75,7 +79,7 @@ def get_endpoints(endpoint_name_filter, sagemaker_client=None):
    for i in sagemaker_endpoints_for_service:
        endpoints.append(i["EndpointName"])
    if len(endpoints) == 0:
-        print(f"No Endpoints found for filter: {endpoint_name_filter}")
+        logger.warning(f"No Endpoints found for filter: {endpoint_name_filter}")
    return endpoints


@@ -99,7 +103,7 @@ def get_models_on_endpoint(model_data_prefix):
        ]
        return models_on_endpoint
    except Exception as e:
-        logging.error(f" --- Failed to get models on endpoint ---- {e}")
+        logger.error(f"Failed to get models on endpoint: {e}")
        models_on_endpoint = []
        return models_on_endpoint

@@ -127,3 +131,41 @@ def clean_string(s):
        return re.sub(r"[\W_]+", "", s)
    else:
        return s


async def fetch_pdb_data(pdb_id: str, chain: Optional[str] = None) -> dict:
    """
    Fetches polymer entity instance data from the RCSB PDB REST API.

    Args:
        pdb_id (str): The PDB ID.
        chain (Optional[str]): The chain ID.

    Returns:
        dict: Parsed JSON response from the API.

    Raises:
        HTTPException: If the PDB entry or chain is not found or other errors occur.
    """
    if chain:
        url = f"https://data.rcsb.org/rest/v1/core/polymer_entity_instance/{pdb_id}/{chain}"
    else:
        url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    headers = {"Accept": "application/json"}

    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 404:
            detail = (
                f"PDB entry {pdb_id} with chain {chain} not found."
                if chain
                else f"PDB entry {pdb_id} not found."
            )
            raise HTTPException(status_code=404, detail=detail)
        else:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"Error fetching PDB data: {response.status_code}",
            )
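
A minimal usage sketch for the new fetch_pdb_data helper (illustrative only, not part of this commit; "4HHB" is an arbitrary example PDB ID, and the struct.title access assumes the RCSB core entry schema):

import asyncio

from app.core.utils import fetch_pdb_data


async def main() -> None:
    # One whole-entry lookup and one chain-scoped lookup (chain "A" of entry 4HHB).
    entry = await fetch_pdb_data("4HHB")
    chain_a = await fetch_pdb_data("4HHB", chain="A")
    # Field access below is illustrative of the returned JSON shape.
    print(entry.get("struct", {}).get("title"))
    print("instance keys:", sorted(chain_a))


asyncio.run(main())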
5 changes: 4 additions & 1 deletion apps/fastapi/src/app/services/background_tasks.py
@@ -1,3 +1,4 @@
+import logging
from typing import Optional

from fastapi import HTTPException
@@ -15,6 +16,8 @@
    process_mhc_ii_prediction,
)

+logger = logging.getLogger(__name__)


async def process_and_update_prediction(
    job_id: str,
@@ -66,7 +69,7 @@ async def process_and_update_prediction(
        await crud_job.update_status(db=db, id=job_id, status="completed")
    except Exception as e:
        # Log the error
-        print(f"Error processing prediction {job_id}: {e}")
+        logger.error(f"Error processing prediction {job_id}: {e}")

        # Update job status to 'failed'
        await crud_job.update_status(db=db, id=job_id, status="failed")
3 changes: 3 additions & 0 deletions apps/fastapi/src/app/services/postprocess.py
@@ -1,3 +1,4 @@
+import logging
from typing import List, Optional

from fastapi import HTTPException
@@ -9,6 +10,8 @@
from app.schemas.mhc_i_prediction import MhcIPredictionResult
from app.schemas.mhc_ii_prediction import MhcIIPredictionResult

+logger = logging.getLogger(__name__)


async def process_conformational_b_prediction(
    pdb_id: Optional[str] = None,
113 changes: 107 additions & 6 deletions apps/fastapi/src/app/services/preprocess.py
@@ -1,15 +1,116 @@
import logging
from typing import Any, Dict, Optional

from fastapi import HTTPException

from app.core.config import settings

logging.basicConfig(
    level=settings.LOG_LEVEL,
    format="%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

"""
Preprocessing functions
"""


def extract_sequence(pdb_data: dict) -> str:
    """
    Extracts the amino acid sequence from the PDB data.

    Args:
        pdb_data (dict): JSON response from the RCSB API.

    Returns:
        str: Amino acid sequence.

    Raises:
        HTTPException: If the sequence data is missing.
    """
    try:
        if "rcsb_polymer_entity_instance" in pdb_data:
            sequence = pdb_data["rcsb_polymer_entity_instance"]["entity"][
                "rcsb_polymer_entity"
            ]["rcsb_polymer"]["sequence"]
        else:
            # Handle the case when a chain is not provided
            sequence = ""
            entities = pdb_data.get("rcsb_entry_container_identifiers", {}).get(
                "polymer_entity_ids", []
            )
            for entity_id in entities:
                entity = pdb_data["polymer_entities"][entity_id]
                sequence += entity["rcsb_polymer"]["sequence"]
        if not sequence:
            raise KeyError
        return sequence
    except KeyError:
        logger.error("Amino acid sequence not found in the PDB data.")
        raise HTTPException(
            status_code=400, detail="Amino acid sequence not found in the PDB data."
        )


def extract_structure(pdb_data: dict) -> Optional[Dict[str, Any]]:
    """
    Extracts structural information from the PDB data.

    Args:
        pdb_data (dict): JSON response from the RCSB API.

    Returns:
        Optional[Dict[str, Any]]: Structural coordinates and other relevant data, or None if not available.

    Raises:
        HTTPException: If structural data is missing when required.
    """
    try:
        if "rcsb_polymer_entity_instance" in pdb_data:
            atoms = pdb_data["rcsb_polymer_entity_instance"]["entity"][
                "rcsb_polymer_entity"
            ]["rcsb_polymer"]["atom_sites"]
            return {"atoms": atoms}
        else:
            # Structure data might not be needed if a chain is not provided
            return None
    except KeyError:
        logger.error("Structural data not found in the PDB data.")
        if "rcsb_polymer_entity_instance" in pdb_data:
            raise HTTPException(
                status_code=400, detail="Structural data not found in the PDB data."
            )
        return None


def prepare_payload(
    sequence: str, structure: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Prepares the payload for prediction based on the available data.

    Args:
        sequence (str): Amino acid sequence.
        structure (Optional[Dict[str, Any]]): Structural data.

    Returns:
        Dict[str, Any]: Payload for prediction.
    """
    payload = {
        "sequence": sequence,
    }
    if structure:
        payload["structure"] = structure
    return payload


def validate_pdb_data(pdb_data: dict, chain: Optional[str] = None) -> Dict[str, Any]:
    """
    Validates and extracts the necessary data from the PDB response.

    Args:
        pdb_data (dict): JSON response from the RCSB API.
        chain (Optional[str]): Chain ID, if provided.

    Returns:
        Dict[str, Any]: Extracted data, including sequence and structure.
    """
    sequence = extract_sequence(pdb_data)
    structure = extract_structure(pdb_data) if chain else None
    return {"sequence": sequence, "structure": structure}
2 changes: 2 additions & 0 deletions apps/fastapi/uv.lock

Some generated files are not rendered by default.
