Add index on create time/improve query performance #92

Querying for send clones is slow.
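For reference, the kind of index the title is asking for could be declared directly on the ORM model, along the lines of the sketch below. The `create_time` column name and default are assumptions for illustration, not the actual model.

```python
from datetime import datetime, timezone

from sqlalchemy import DateTime
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column


class Base(DeclarativeBase):
    pass


class File(Base):
    __tablename__ = "files"

    name: Mapped[str] = mapped_column(primary_key=True)
    # Hypothetical column name: indexing the creation timestamp lets
    # time-ordered or time-filtered queries avoid a full table scan.
    create_time: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        index=True,
    )
```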
Should really do some query profiling with a test database.

Plan of action:
So in this test setup, even with 1 million files, the query time is only around 0.5 seconds:

```python
from sqlalchemy import ForeignKey, Integer, String, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class File(Base):
    __tablename__ = "files"

    name: Mapped[str] = mapped_column(primary_key=True)
    size: Mapped[int] = mapped_column(Integer)
    remote_instances: Mapped[list["RemoteInstance"]] = relationship("RemoteInstance")


class RemoteInstance(Base):
    __tablename__ = "remote_instances"

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    librarian_id: Mapped[int] = mapped_column()
    file_name: Mapped[str] = mapped_column(ForeignKey("files.name"))
    file: Mapped[File] = relationship(File)


class OutgoingTransfer(Base):
    __tablename__ = "outgoing_transfers"

    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    file_name: Mapped[str] = mapped_column(ForeignKey("files.name"))
    file: Mapped[File] = relationship(File)
    status: Mapped[str] = mapped_column(String)
    destination: Mapped[str] = mapped_column()


# Use testcontainers to spin up postgres
from testcontainers.postgres import PostgresContainer

postgres = PostgresContainer()
postgres.start()

# Create the tables (the URL rewrite swaps the psycopg2 driver for psycopg 3)
engine = create_engine(postgres.get_connection_url().replace("psycopg2", "psycopg"))
Base.metadata.create_all(engine)

# Insert some data
N_FILES = 1_000_000
FRACTION_WITH_REMOTE_INSTANCES = 0.995
FRACTION_WITH_OUTGOING_TRANSFERS = 0.004

PLATFORMS = ["satp1", "satp2", "satp3", "lat", "site"]

import random

FILES = [
    File(
        name=f"{PLATFORMS[x % len(PLATFORMS)]}/{x}/file.txt",
        size=random.randint(1, 1000),
    )
    for x in range(N_FILES)
]

REMOTE_INSTANCES = [
    RemoteInstance(
        file_name=FILES[x].name,
        librarian_id=1,
    )
    for x in range(int(N_FILES * FRACTION_WITH_REMOTE_INSTANCES))
]

OUTGOING_TRANSFERS = [
    OutgoingTransfer(
        file_name=FILES[x].name,
        status="COMPLETE",
        destination="external_librarian",
    )
    for x in range(int(N_FILES * (FRACTION_WITH_OUTGOING_TRANSFERS + FRACTION_WITH_REMOTE_INSTANCES)))
]

# Mark the slice of transfers beyond the remote-instance fraction as in progress
for x in range(
    int(N_FILES * FRACTION_WITH_REMOTE_INSTANCES),
    int(N_FILES * (FRACTION_WITH_OUTGOING_TRANSFERS + FRACTION_WITH_REMOTE_INSTANCES)),
):
    OUTGOING_TRANSFERS[x].status = "IN_PROGRESS"

from sqlalchemy.orm import sessionmaker

Session = sessionmaker(bind=engine)

with Session() as session:
    session.add_all(FILES)
    session.add_all(REMOTE_INSTANCES)
    session.add_all(OUTGOING_TRANSFERS)
    session.commit()


def run_query(session):
    # Files with no remote instance at librarian 1 and no in-progress
    # outgoing transfer to the external librarian, i.e. candidates to send.
    file_stmt = select(File)

    remote_instances_stmt = select(RemoteInstance.file_name).filter(
        RemoteInstance.librarian_id == 1
    )

    outgoing_transfer_stmt = (
        select(OutgoingTransfer.file_name)
        .filter(OutgoingTransfer.destination == "external_librarian")
        .filter(
            OutgoingTransfer.status.in_(
                [
                    "IN_PROGRESS",
                ]
            )
        )
    )

    file_stmt = file_stmt.where(File.name.not_in(remote_instances_stmt))
    file_stmt = file_stmt.where(File.name.not_in(outgoing_transfer_stmt))

    files_without_remote_instances: list[File] = (
        session.execute(file_stmt).scalars().all()
    )

    return len(files_without_remote_instances)


# Run the query
import time

start = time.time()

with Session() as session:
    print(run_query(session))

print("Time taken:", time.time() - start)
```
JBorrow changed the title from "Add index on create time" to "Add index on create time/improve query performance" on Nov 12, 2024.
Logging added in #112 so we can now trace the speed of these queries in prod.
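For anyone poking at this locally without that logging, SQLAlchemy's cursor-execute events are an easy way to get per-statement timings out of the engine. This is just the generic recipe, not necessarily how #112 implements it:

```python
import logging
import time

from sqlalchemy import event

logger = logging.getLogger("slow_queries")


@event.listens_for(engine, "before_cursor_execute")
def _start_timer(conn, cursor, statement, parameters, context, executemany):
    # Stash the start time on the connection so the "after" hook can read it.
    conn.info.setdefault("query_start_time", []).append(time.perf_counter())


@event.listens_for(engine, "after_cursor_execute")
def _log_slow_queries(conn, cursor, statement, parameters, context, executemany):
    elapsed = time.perf_counter() - conn.info["query_start_time"].pop()
    if elapsed > 0.1:  # arbitrary 100 ms threshold
        logger.warning("Slow query (%.3fs): %s", elapsed, statement)
```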