Protein Target Library — Design Document¶
Overview¶
The Protein Target Library module enables users to import, browse, and manage protein targets and their 3D structures. It integrates with external databases (UniProt, RCSB PDB, AlphaFold DB) to fetch sequence and structural data, and provides tools for binding site detection, sequence alignment, and structural alignment.
This module is the foundation for all downstream drug discovery tasks: you need a protein target before you can dock compounds against it.
Click diagram to zoom and pan:
Database Schema¶
Click diagram to zoom and pan:
Entity-Relationship Diagram¶
┌────────────────────┐ ┌───────────────────────┐
│ ProteinTarget │ │ ProteinStructure │
├────────────────────┤ ├───────────────────────┤
│ id (PK) │ │ id (PK) │
│ uniprot_id (UQ,IX) │◀──┐ │ protein_target_id (FK)│──▶ ProteinTarget.id
│ pdb_ids (JSON) │ │ │ pdb_id │
│ name │ │ │ source │ (rcsb/alphafold/user_upload)
│ gene_name │ │ │ file_format │ (pdb/mmcif)
│ organism │ │ │ structure_data (Text) │ (full PDB/mmCIF content)
│ description (Text) │ │ │ resolution │
│ sequence (Text) │ │ │ method │ (xray/nmr/cryo_em/predicted)
│ sequence_length │ │ │ chains (JSON) │
│ ec_number │ │ │ has_ligand │
│ function_desc(Text)│ │ │ ligand_ids (JSON) │
│ created_at │ │ │ created_at │
│ updated_at │ │ └───────────────────────┘
└────────────────────┘ │
│ 1:N
▼
┌───────────────────────┐
│ BindingSite │
├───────────────────────┤
│ id (PK) │
│ protein_structure_id │──▶ ProteinStructure.id
│ name │
│ center_x (Float) │
│ center_y (Float) │
│ center_z (Float) │
│ box_size_x (Float) │
│ box_size_y (Float) │
│ box_size_z (Float) │
│ residues (JSON) │ list of "chain:resnum"
│ druggability_score │
│ volume (Float) │
│ detection_method │ (fpocket/manual/ligand_based)
│ created_at │
└───────────────────────┘
┌───────────────────────────┐
│ SequenceAlignment │
├───────────────────────────┤
│ id (PK) │
│ name │
│ alignment_type │ (pairwise/multiple)
│ method │ (biopython/mafft/clustalo)
│ input_sequences (JSON) │ list of {id, name, sequence}
│ alignment_data (Text) │ aligned FASTA format
│ score (Float, nullable) │
│ identity_pct (Float, null)│
│ num_sequences (Int) │
│ created_at │
└───────────────────────────┘
┌───────────────────────────┐
│ StructuralAlignment │
├───────────────────────────┤
│ id (PK) │
│ name │
│ structure1_id (FK) │──▶ ProteinStructure.id
│ structure2_id (FK) │──▶ ProteinStructure.id
│ method │ (tmalign/superimposer)
│ tm_score (Float) │
│ rmsd (Float) │
│ aligned_length (Int) │
│ rotation_matrix (JSON) │ 3x3 matrix as nested list
│ translation_vector (JSON) │ [x, y, z]
│ aligned_pdb_data (Text) │ transformed PDB for overlay
│ created_at │
└───────────────────────────┘
ORM Models¶
chemlib/models/protein.py¶
from __future__ import annotations
from datetime import datetime
from typing import Optional
from sqlalchemy import String, Text, Integer, Float, Boolean, DateTime, ForeignKey, JSON, Index
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy.sql import func
from chemlib.models.base import Base
class ProteinTarget(Base):
    """ORM model for a protein target and its descriptive metadata.

    A target owns its structures: the ``structures`` relationship cascades
    every operation, including delete-orphan.
    """

    __tablename__ = "protein_targets"

    id: Mapped[int] = mapped_column(primary_key=True)
    # Optional external accession; unique and indexed when present.
    uniprot_id: Mapped[Optional[str]] = mapped_column(
        String(20),
        unique=True,
        index=True,
        nullable=True,
    )
    # JSON list of PDB codes, e.g. ["1M17", "3W2S"].
    pdb_ids: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)
    # The display name is the only mandatory descriptive field.
    name: Mapped[str] = mapped_column(String(500), nullable=False)
    gene_name: Mapped[Optional[str]] = mapped_column(String(100), nullable=True)
    organism: Mapped[Optional[str]] = mapped_column(String(200), nullable=True)
    description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
    sequence: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
    sequence_length: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)
    ec_number: Mapped[Optional[str]] = mapped_column(String(50), nullable=True)
    function_description: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    # Timestamps are filled in by the database server.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
    )

    # Child structures, kept in sync with ProteinStructure.protein_target.
    structures: Mapped[list[ProteinStructure]] = relationship(
        back_populates="protein_target",
        cascade="all, delete-orphan",
    )

    def __repr__(self) -> str:
        return f"<ProteinTarget(id={self.id}, name='{self.name}', uniprot_id='{self.uniprot_id}')>"
class ProteinStructure(Base):
    """ORM model for one 3D structure (file content plus metadata) of a target.

    Binding sites hang off a structure and are removed with it (the
    ``binding_sites`` relationship cascades all operations and delete-orphan).
    """

    __tablename__ = "protein_structures"

    id: Mapped[int] = mapped_column(primary_key=True)
    # Owning target; the database removes the row when the target is deleted.
    protein_target_id: Mapped[int] = mapped_column(
        ForeignKey("protein_targets.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    pdb_id: Mapped[Optional[str]] = mapped_column(String(10), nullable=True, index=True)
    # Origin of the file: "rcsb", "alphafold" or "user_upload".
    source: Mapped[str] = mapped_column(String(20), nullable=False)
    # Serialisation of structure_data: "pdb" or "mmcif".
    file_format: Mapped[str] = mapped_column(String(10), nullable=False, default="pdb")
    # Complete PDB or mmCIF file content.
    structure_data: Mapped[str] = mapped_column(Text, nullable=False)
    resolution: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    # "xray", "nmr", "cryo_em" or "predicted".
    method: Mapped[Optional[str]] = mapped_column(String(20), nullable=True)
    # JSON list of chain IDs, e.g. ["A", "B"].
    chains: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)
    has_ligand: Mapped[bool] = mapped_column(Boolean, default=False)
    # JSON list of ligand HET codes, e.g. ["ATP", "MG"].
    ligand_ids: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )

    # Parent target and child binding sites.
    protein_target: Mapped[ProteinTarget] = relationship(back_populates="structures")
    binding_sites: Mapped[list[BindingSite]] = relationship(
        back_populates="protein_structure",
        cascade="all, delete-orphan",
    )

    def __repr__(self) -> str:
        return f"<ProteinStructure(id={self.id}, pdb_id='{self.pdb_id}', source='{self.source}')>"
class BindingSite(Base):
    """ORM model for a binding site: a box (centre + size) on a structure."""

    __tablename__ = "binding_sites"

    id: Mapped[int] = mapped_column(primary_key=True)
    # Owning structure; the database removes the row when the structure goes.
    protein_structure_id: Mapped[int] = mapped_column(
        ForeignKey("protein_structures.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    name: Mapped[str] = mapped_column(String(200), nullable=False)

    # Box centre (required).
    center_x: Mapped[float] = mapped_column(Float, nullable=False)
    center_y: Mapped[float] = mapped_column(Float, nullable=False)
    center_z: Mapped[float] = mapped_column(Float, nullable=False)

    # Box edge lengths; each defaults to 20.0.
    box_size_x: Mapped[float] = mapped_column(Float, nullable=False, default=20.0)
    box_size_y: Mapped[float] = mapped_column(Float, nullable=False, default=20.0)
    box_size_z: Mapped[float] = mapped_column(Float, nullable=False, default=20.0)

    # JSON list of residue labels, e.g. ["A:GLU45", "A:ASP52", "A:LYS721"].
    residues: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)
    druggability_score: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    # Pocket volume in cubic angstroms.
    volume: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    # How the site was obtained: "fpocket", "manual" or "ligand_based".
    detection_method: Mapped[str] = mapped_column(
        String(20),
        nullable=False,
        default="manual",
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )

    # Parent structure.
    protein_structure: Mapped[ProteinStructure] = relationship(back_populates="binding_sites")

    def __repr__(self) -> str:
        return f"<BindingSite(id={self.id}, name='{self.name}', method='{self.detection_method}')>"
chemlib/models/alignment.py¶
from __future__ import annotations
from datetime import datetime
from typing import Optional
from sqlalchemy import String, Text, Integer, Float, DateTime, ForeignKey, JSON
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlalchemy.sql import func
from chemlib.models.base import Base
class SequenceAlignment(Base):
    """ORM model for a stored sequence alignment (inputs, result and scores)."""

    __tablename__ = "sequence_alignments"

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(300), nullable=False)
    # "pairwise" or "multiple".
    alignment_type: Mapped[str] = mapped_column(String(20), nullable=False)
    # Tool used: "biopython", "mafft" or "clustalo".
    method: Mapped[str] = mapped_column(String(20), nullable=False)
    # JSON list of inputs, e.g.
    # [{"id": "P00533", "name": "EGFR_HUMAN", "sequence": "MRPSG..."}, ...]
    input_sequences: Mapped[list] = mapped_column(JSON, nullable=False)
    # Result in aligned FASTA format.
    alignment_data: Mapped[str] = mapped_column(Text, nullable=False)
    score: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    identity_pct: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    num_sequences: Mapped[int] = mapped_column(Integer, nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )

    def __repr__(self) -> str:
        return f"<SequenceAlignment(id={self.id}, name='{self.name}', type='{self.alignment_type}')>"
class StructuralAlignment(Base):
    """ORM model for a stored structural alignment between two structures.

    ``structure2`` is the structure that gets transformed: the stored
    rotation/translation and ``aligned_pdb_data`` describe structure2 placed
    onto structure1 for overlay.
    """

    __tablename__ = "structural_alignments"

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(300), nullable=False)

    # Both foreign keys are indexed, like every other FK column in these
    # models, so lookups by structure do not need a full table scan.
    structure1_id: Mapped[int] = mapped_column(
        ForeignKey("protein_structures.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )
    structure2_id: Mapped[int] = mapped_column(
        ForeignKey("protein_structures.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Tool used: "tmalign" or "superimposer".
    method: Mapped[str] = mapped_column(String(20), nullable=False)
    tm_score: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    rmsd: Mapped[Optional[float]] = mapped_column(Float, nullable=True)
    aligned_length: Mapped[Optional[int]] = mapped_column(Integer, nullable=True)
    # 3x3 nested list: [[r11,r12,r13],[r21,r22,r23],[r31,r32,r33]].
    rotation_matrix: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)
    # [tx, ty, tz].
    translation_vector: Mapped[Optional[list]] = mapped_column(JSON, nullable=True)
    # Transformed structure2 PDB data for overlay.
    aligned_pdb_data: Mapped[Optional[str]] = mapped_column(Text, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
    )

    # Two relationships target the same table, so each one must name the
    # foreign key it follows.
    structure1: Mapped[ProteinStructure] = relationship(
        foreign_keys=[structure1_id]
    )
    structure2: Mapped[ProteinStructure] = relationship(
        foreign_keys=[structure2_id]
    )

    def __repr__(self) -> str:
        return f"<StructuralAlignment(id={self.id}, tm_score={self.tm_score})>"
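Note: alignment.py refers to ProteinStructure without importing it. To avoid a circular import between the model modules, either import it from chemlib.models.protein only under `typing.TYPE_CHECKING` (the annotations are lazy because of `from __future__ import annotations`) or use string-based relationship references such as `relationship("ProteinStructure", ...)`. In both cases chemlib.models.protein must have been imported somewhere before the mappers are configured.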
Pydantic Schemas¶
chemlib/schemas/protein.py¶
from __future__ import annotations
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, ConfigDict, Field
# --- ProteinTarget ---
class ProteinTargetCreate(BaseModel):
    """Manual creation of a protein target."""

    # Only the display name is required.
    name: str

    # Optional metadata; every field below defaults to None.
    uniprot_id: str | None = None
    gene_name: str | None = None
    organism: str | None = None
    description: str | None = None
    sequence: str | None = None
    ec_number: str | None = None
    function_description: str | None = None
class ProteinTargetResponse(BaseModel):
    # Built from ORM objects via attribute access.
    model_config = ConfigDict(from_attributes=True)

    id: int
    uniprot_id: str | None
    pdb_ids: list[str] | None
    name: str
    gene_name: str | None
    organism: str | None
    description: str | None
    sequence: str | None
    sequence_length: int | None
    ec_number: str | None
    function_description: str | None
    created_at: datetime
    updated_at: datetime
    # Computed field; 0 unless the caller fills it in.
    structure_count: int = 0
class ProteinTargetFilter(BaseModel):
    # Optional filter criteria; each one defaults to None.
    name: str | None = None
    gene_name: str | None = None
    organism: str | None = None
    ec_number: str | None = None

    # Pagination window. Negative values are rejected at validation time
    # rather than being passed on to the query.
    # NOTE(review): there is no upper bound on `limit`; consider adding `le=`
    # once a maximum page size has been agreed.
    limit: int = Field(default=50, ge=0)
    offset: int = Field(default=0, ge=0)
class ProteinTargetListResponse(BaseModel):
    # One page of targets plus a count.
    # NOTE(review): `total` is presumably the number of matches before
    # limit/offset are applied — confirm against the service implementation.
    items: list[ProteinTargetResponse]
    total: int
# --- ProteinStructure ---
class ProteinStructureCreate(BaseModel):
    """Upload a structure manually."""

    protein_target_id: int
    pdb_id: str | None = None
    # One of "rcsb", "alphafold", "user_upload".
    source: str = "user_upload"
    # One of "pdb", "mmcif".
    file_format: str = "pdb"
    # Full PDB/mmCIF content.
    structure_data: str
    resolution: float | None = None
    # One of "xray", "nmr", "cryo_em", "predicted".
    method: str | None = None
class ProteinStructureResponse(BaseModel):
    # Built from ORM objects via attribute access.
    model_config = ConfigDict(from_attributes=True)

    # structure_data is deliberately absent: it is too large for list
    # responses.
    id: int
    protein_target_id: int
    pdb_id: str | None
    source: str
    file_format: str
    resolution: float | None
    method: str | None
    chains: list[str] | None
    has_ligand: bool
    ligand_ids: list[str] | None
    created_at: datetime
class ProteinStructureDetailResponse(ProteinStructureResponse):
    """Includes full structure data for detail/download endpoints."""

    # The complete file content, in addition to all summary fields.
    structure_data: str
# --- BindingSite ---
class BindingSiteCreate(BaseModel):
    """Manually define a binding site."""

    name: str

    # Box centre (required).
    center_x: float
    center_y: float
    center_z: float

    # Box edge lengths; each defaults to 20.0.
    box_size_x: float = 20.0
    box_size_y: float = 20.0
    box_size_z: float = 20.0

    # Optional residue labels, e.g. ["A:GLU45", "A:ASP52"].
    residues: list[str] | None = None
class BindingSiteFromLigand(BaseModel):
    """Define binding site from a co-crystallized ligand."""

    # Ligand identifier, e.g. "ATP".
    ligand_id: str
    # Margin around the ligand, in angstroms.
    padding: float = 5.0
class BindingSiteResponse(BaseModel):
    # Built from ORM objects via attribute access.
    model_config = ConfigDict(from_attributes=True)

    id: int
    protein_structure_id: int
    name: str

    # Box centre and edge lengths.
    center_x: float
    center_y: float
    center_z: float
    box_size_x: float
    box_size_y: float
    box_size_z: float

    # Optional annotations; None when not available.
    residues: list[str] | None
    druggability_score: float | None
    volume: float | None

    detection_method: str
    created_at: datetime
# --- Alignment ---
class SequenceInput(BaseModel):
    # Defined before SequenceAlignmentRequest so the request model can resolve
    # its field type at class-creation time instead of relying on a deferred
    # forward-reference rebuild.
    id: str  # identifier (UniProt ID, custom name)
    name: str  # display name
    sequence: str  # amino acid sequence


class SequenceAlignmentRequest(BaseModel):
    """Request a sequence alignment."""

    name: str
    # An alignment needs at least two sequences; shorter lists are rejected
    # at validation time.
    sequences: list[SequenceInput] = Field(min_length=2)
    # One of "biopython" (pairwise), "mafft", "clustalo".
    method: str = "biopython"
class SequenceAlignmentResponse(BaseModel):
    # Built from ORM objects via attribute access.
    model_config = ConfigDict(from_attributes=True)

    id: int
    name: str
    alignment_type: str
    method: str
    num_sequences: int
    score: float | None
    identity_pct: float | None
    # The alignment itself, as aligned FASTA text.
    alignment_data: str
    created_at: datetime
class StructuralAlignmentRequest(BaseModel):
    name: str
    # IDs of the two stored structures to align.
    structure1_id: int
    structure2_id: int
    # One of "tmalign", "superimposer".
    method: str = "tmalign"
class StructuralAlignmentResponse(BaseModel):
    # Built from ORM objects via attribute access.
    model_config = ConfigDict(from_attributes=True)

    id: int
    name: str
    structure1_id: int
    structure2_id: int
    method: str

    # Alignment metrics; None when not available.
    tm_score: float | None
    rmsd: float | None
    aligned_length: int | None

    aligned_pdb_data: str | None
    created_at: datetime
Service Layer¶
chemlib/services/protein_target_service.py¶
class ProteinTargetService:
    """Business logic for protein target management."""

    async def import_from_uniprot(
        self, db: AsyncSession, accession: str
    ) -> ProteinTargetResponse:
        """
        Fetch a protein from the UniProt REST API and store it.

        Steps:
        1. If a target with this uniprot_id already exists, return it as is.
        2. Call UniProt: GET https://rest.uniprot.org/uniprotkb/{accession}.json
        3. Parse the response (any of these may be absent):
           - name:      proteinDescription.recommendedName.fullName.value
           - gene name: genes[0].geneName.value
           - organism:  organism.scientificName
           - sequence:  sequence.value and sequence.length
           - EC number: proteinDescription.recommendedName.ecNumbers[].value
           - function:  comments[] entries with commentType == "FUNCTION"
           The target's name column is NOT NULL, so a fallback (for example
           the accession) is needed when no recommended name is present.
        4. Collect associated PDB IDs from the uniProtKBCrossReferences
           entries whose database is "PDB".
        5. Create the ProteinTarget record.
        6. Return the response schema.
        """

    async def import_from_pdb(
        self, db: AsyncSession, pdb_id: str
    ) -> ProteinTargetResponse:
        """
        Import a protein target starting from a PDB ID.

        The UniProt mapping is looked up at RCSB first and the target is then
        imported from UniProt; PDB metadata is the fallback when no mapping
        exists.

        Steps:
        1. Call the RCSB API for the entry:
           GET https://data.rcsb.org/rest/v1/core/entry/{pdb_id}
        2. Extract UniProt accessions from the polymer entities.
           NOTE(review): the core/entry payload appears to list polymer
           entity IDs only, with the UniProt accessions living on the
           per-entity records (core/polymer_entity/{pdb_id}/{entity_id}) —
           confirm against the RCSB Data API before implementing.
        3. If an accession is found, call import_from_uniprot().
        4. Otherwise create the target from PDB metadata (entity name,
           organism, sequence).
        5. Also fetch and store the structure (delegates to
           ProteinStructureService).
        """

    async def search_uniprot(
        self, query: str, limit: int = 25
    ) -> list[dict]:
        """
        Search UniProt for proteins matching a query string.

        Nothing is stored; the hits are returned for the user to pick from.

        Call: GET https://rest.uniprot.org/uniprotkb/search?query={query}&size={limit}&format=json
        Returns: list of {accession, name, gene, organism, sequence_length}
        """

    async def get(self, db: AsyncSession, target_id: int) -> ProteinTargetResponse | None:
        """Get a protein target by ID, including structure count."""

    async def list_targets(
        self, db: AsyncSession, filters: ProteinTargetFilter
    ) -> ProteinTargetListResponse:
        """List protein targets with optional filters and pagination."""

    async def delete(self, db: AsyncSession, target_id: int) -> bool:
        """Delete a protein target and all associated structures (cascade)."""

    async def update(
        self, db: AsyncSession, target_id: int, data: ProteinTargetCreate
    ) -> ProteinTargetResponse:
        """Update protein target metadata."""
chemlib/services/protein_structure_service.py¶
class ProteinStructureService:
    """Business logic for protein structure management."""

    async def fetch_from_rcsb(
        self, db: AsyncSession, pdb_id: str, protein_target_id: int
    ) -> ProteinStructureResponse:
        """
        Download a PDB entry from RCSB and persist it.

        Steps:
        1. Download the PDB file: GET https://files.rcsb.org/download/{pdb_id}.pdb
        2. Parse it with Biopython's Bio.PDB.PDBParser and extract:
           - chain IDs
           - resolution (from the header)
           - method (X-RAY, NMR, etc.)
           - ligand HET codes (non-standard residues)
        3. Keep the full PDB content in structure_data.
        4. Create the ProteinStructure record.
        """

    async def fetch_from_alphafold(
        self, db: AsyncSession, uniprot_id: str, protein_target_id: int
    ) -> ProteinStructureResponse:
        """
        Retrieve a predicted structure from AlphaFold DB and persist it.

        Steps:
        1. GET https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}
        2. Read the PDB URL from the response (pdbUrl field).
        3. Download the PDB file from that URL.
        4. Store it with source="alphafold", method="predicted".
        """

    async def upload_structure(
        self, db: AsyncSession, protein_target_id: int,
        file_data: str, file_format: str = "pdb",
        pdb_id: str | None = None
    ) -> ProteinStructureResponse:
        """
        Persist a PDB/mmCIF structure supplied by the user.

        Steps:
        1. Validate the data by attempting to parse it with Biopython.
        2. Extract chain IDs, ligands and method from the header.
        3. Store it with source="user_upload".
        """

    async def get(self, db: AsyncSession, structure_id: int) -> ProteinStructureDetailResponse | None:
        """Fetch one structure by ID, with its full structure_data."""

    async def get_chains(self, db: AsyncSession, structure_id: int) -> list[dict]:
        """
        Parse the stored structure and describe its chains.

        Returns: [{"chain_id": "A", "length": 320, "first_residue": 1, "last_residue": 320}, ...]
        """

    async def extract_sequence(
        self, db: AsyncSession, structure_id: int, chain_id: str
    ) -> str:
        """
        Read the amino acid sequence of one chain using Biopython.

        Returns: one-letter amino acid sequence string.
        """

    async def list_for_target(
        self, db: AsyncSession, protein_target_id: int
    ) -> list[ProteinStructureResponse]:
        """Return every structure that belongs to the given protein target."""

    async def delete(self, db: AsyncSession, structure_id: int) -> bool:
        """Remove a structure together with its binding sites (cascade)."""
chemlib/services/binding_site_service.py¶
class BindingSiteService:
    """Business logic for binding site detection and management."""

    async def detect_pockets(
        self, db: AsyncSession, structure_id: int,
        min_druggability: float = 0.0
    ) -> list[BindingSiteResponse]:
        """
        Run Fpocket on a protein structure and store the detected pockets.

        Steps:
        1. Load the structure from the DB and write structure_data to a
           uniquely named temp file (for example inside a
           tempfile.TemporaryDirectory). The file name must not be derived
           from pdb_id: that field is optional, and two concurrent runs on
           the same entry would overwrite each other's files.
        2. Run: fpocket -f {tmp_dir}/{stem}.pdb
        3. Parse the Fpocket output:
           - {stem}_out/{stem}_info.txt for pocket scores
           - {stem}_out/pockets/ for pocket PDB files
           - for each pocket: center (average of alpha sphere coords),
             bounding box, residues, druggability score, volume
        4. Filter by min_druggability.
        5. Create BindingSite records with detection_method="fpocket".
        6. Clean up the temp files, including when Fpocket or parsing fails.
        7. Return the sites sorted by druggability_score descending.
        """

    async def define_from_ligand(
        self, db: AsyncSession, structure_id: int,
        ligand_id: str, padding: float = 5.0
    ) -> BindingSiteResponse:
        """
        Define a binding site from the position of a co-crystallized ligand.

        Steps:
        1. Parse the structure and find the HETATM records matching ligand_id.
        2. Compute the ligand center of mass (center_x/y/z).
        3. Compute the bounding box of the ligand atoms plus padding on each
           side.
        4. Identify protein residues within padding distance of any ligand
           atom.
        5. Create the BindingSite with detection_method="ligand_based".
        """

    async def define_manual(
        self, db: AsyncSession, structure_id: int,
        data: BindingSiteCreate
    ) -> BindingSiteResponse:
        """
        Store a manually defined binding site.

        The user provides the center and box dimensions directly.
        """

    async def get(self, db: AsyncSession, site_id: int) -> BindingSiteResponse | None:
        """Get a binding site by ID."""

    async def list_for_structure(
        self, db: AsyncSession, structure_id: int
    ) -> list[BindingSiteResponse]:
        """List all binding sites for a structure."""

    async def delete(self, db: AsyncSession, site_id: int) -> bool:
        """Delete a binding site."""
chemlib/services/alignment_service.py¶
class AlignmentService:
    """Sequence and structural alignment operations."""

    async def pairwise_sequence_align(
        self, db: AsyncSession,
        name: str,
        seq1: SequenceInput, seq2: SequenceInput,
        method: str = "biopython",
        matrix: str = "BLOSUM62",
        gap_open: float = -10.0,
        gap_extend: float = -0.5
    ) -> SequenceAlignmentResponse:
        """
        Perform pairwise sequence alignment.

        Method "biopython":
        1. Build a Bio.Align.PairwiseAligner whose substitution matrix is
           loaded from the ``matrix`` argument
           (substitution_matrices.load(matrix); default "BLOSUM62").
        2. Set open_gap_score=gap_open and extend_gap_score=gap_extend.
        3. Run aligner.align(seq1.sequence, seq2.sequence).
        4. Take the best alignment and compute the identity percentage.
        5. Format the result as aligned FASTA.
        6. Store a SequenceAlignment record with alignment_type="pairwise".

        Method "mafft":
        1. Write both sequences to a uniquely named temp FASTA file (a fixed
           path such as /tmp/input.fasta would be shared by concurrent
           requests).
        2. Run: mafft --auto {input_fasta} and capture stdout, which holds
           the aligned FASTA (no shell redirection needed).
        3. Parse the output alignment.
        """

    async def multiple_sequence_align(
        self, db: AsyncSession,
        name: str,
        sequences: list[SequenceInput],
        method: str = "mafft"
    ) -> SequenceAlignmentResponse:
        """
        Perform multiple sequence alignment.

        Method "mafft" (recommended for 3+ sequences):
        1. Write the sequences to a uniquely named temp FASTA file.
        2. Run: mafft --auto {input_fasta} and capture stdout.
        3. Parse the aligned FASTA output.
        4. Compute pairwise identity percentages (average).
        5. Store with alignment_type="multiple".

        Method "clustalo":
        1. Write the sequences to a uniquely named temp FASTA file.
        2. Run: clustalo -i {input_fasta} -o {output_fasta} --outfmt=fasta
        3. Parse and store.
        """

    async def structural_align(
        self, db: AsyncSession,
        name: str,
        structure1_id: int, structure2_id: int,
        method: str = "tmalign",
        chain1: str = "A", chain2: str = "A"
    ) -> StructuralAlignmentResponse:
        """
        Perform structural alignment between two protein structures.

        Method "tmalign" (via tmtools):
        1. Get both structures from the DB.
        2. Parse with Biopython, extract CA atoms for the specified chains.
        3. Build coordinate arrays (N x 3).
        4. Call tmtools.tm_align(coords1, coords2, seq1, seq2).
        5. Extract: TM-score, RMSD, aligned_length, rotation_matrix,
           translation_vector.
        6. Apply the transformation to the structure2 PDB data to generate
           aligned_pdb_data.
        7. Store the StructuralAlignment record.

        Method "superimposer" (via Biopython):
        1. Parse the structures, extract CA atoms.
        2. Use Bio.PDB.Superimposer to compute RMSD and rotation/translation.
        3. Apply to all atoms of structure2.
        4. Store the results.
        """

    async def get_sequence_alignment(
        self, db: AsyncSession, alignment_id: int
    ) -> SequenceAlignmentResponse | None:
        """Get a sequence alignment by ID."""

    async def get_structural_alignment(
        self, db: AsyncSession, alignment_id: int
    ) -> StructuralAlignmentResponse | None:
        """Get a structural alignment by ID."""

    async def list_sequence_alignments(
        self, db: AsyncSession, limit: int = 50, offset: int = 0
    ) -> list[SequenceAlignmentResponse]:
        """List all sequence alignments."""

    async def generate_alignment_image(
        self, alignment_id: int
    ) -> bytes:
        """
        Generate a static alignment image (PNG) using pyMSAviz.

        Reads alignment_data from the DB record and renders it as a
        color-coded image.

        NOTE(review): unlike every other method that touches the database,
        this signature takes no ``db: AsyncSession``. Decide how the record
        is loaded (add a db parameter, or pass the alignment data in) before
        implementing.

        Returns PNG bytes.
        """
Bioinformatics Utility Layer¶
chemlib/bioinformatics/pdb_parser.py¶
"""Pure PDB/mmCIF parsing utilities using Biopython. No DB access."""
from Bio.PDB import PDBParser, MMCIFParser, PDBIO
from Bio.PDB.Structure import Structure
from Bio.PDB.Chain import Chain
from Bio.PDB.Residue import Residue
import io
def parse_pdb_string(pdb_data: str, structure_id: str = "X") -> Structure:
    """Build a Biopython Structure from in-memory PDB-format text."""
    # The parser reads from a file-like object, so wrap the text in a buffer.
    return PDBParser(QUIET=True).get_structure(structure_id, io.StringIO(pdb_data))
def parse_mmcif_string(mmcif_data: str, structure_id: str = "X") -> Structure:
    """Build a Biopython Structure from in-memory mmCIF-format text."""
    # The parser reads from a file-like object, so wrap the text in a buffer.
    return MMCIFParser(QUIET=True).get_structure(structure_id, io.StringIO(mmcif_data))
def extract_chains(structure: Structure) -> list[dict]:
    """
    Describe every chain of a Biopython Structure.

    Returns: [{"chain_id": "A", "length": 320, "residues": [...], "sequence": "MRPS..."}]
    """
def extract_ligands(structure: Structure) -> list[dict]:
    """
    List the ligands among the non-standard residues (HETATMs).

    Water (HOH/WAT) and common ions are left out.

    Returns: [{"het_id": "ATP", "chain": "A", "res_num": 501, "num_atoms": 31}]
    """
def extract_sequence_from_chain(chain: Chain) -> str:
    """Return the chain's amino acid sequence in one-letter code."""
def get_resolution(structure: Structure) -> float | None:
    """Return the resolution recorded in the PDB header, or None if absent."""
def get_method(structure: Structure) -> str | None:
    """Return the experimental method recorded in the PDB header."""
def compute_center_of_mass(residues: list[Residue]) -> tuple[float, float, float]:
    """Return the (x, y, z) center of mass of the given residues."""
def get_residues_near_point(
    structure: Structure,
    center: tuple[float, float, float],
    radius: float,
    chain_id: str | None = None,
) -> list[str]:
    """
    List the residues that lie within ``radius`` of ``center``.

    Returns: ["A:GLU45", "A:ASP52", ...]
    """
def structure_to_pdb_string(structure: Structure) -> str:
    """Serialise a Biopython Structure to PDB-format text."""
    writer = PDBIO()
    writer.set_structure(structure)
    # PDBIO writes to a file-like object; collect the text in memory.
    buffer = io.StringIO()
    writer.save(buffer)
    return buffer.getvalue()
chemlib/bioinformatics/sequence_tools.py¶
"""Sequence alignment utilities. No DB access."""
from Bio.Align import PairwiseAligner, substitution_matrices
import subprocess
import tempfile
from pathlib import Path
def pairwise_align_biopython(
    seq1: str,
    seq2: str,
    matrix: str = "BLOSUM62",
    gap_open: float = -10.0,
    gap_extend: float = -0.5,
) -> dict:
    """
    Align two sequences with Biopython's pairwise aligner.

    Returns: {
    "aligned_seq1": "MRPS-GTAGC...",
    "aligned_seq2": "MRP--GTAGC...",
    "score": 245.0,
    "identity_pct": 78.5,
    "alignment_fasta": ">seq1\nMRPS-GTAGC...\n>seq2\nMRP--GTAGC...\n"
    }
    """
def multiple_align_mafft(
    sequences: list[dict],  # [{"id": "...", "name": "...", "sequence": "..."}]
    mafft_binary: str = "mafft",
) -> dict:
    """
    Multiple sequence alignment through a MAFFT subprocess.

    The input FASTA is written to a temp file, MAFFT is run on it and its
    output is parsed.

    Returns: {
    "alignment_fasta": ">seq1\nMR-PS...\n>seq2\nMRPPS...\n",
    "num_sequences": 5,
    "alignment_length": 320
    }
    """
def multiple_align_clustalo(
    sequences: list[dict],
    clustalo_binary: str = "clustalo",
) -> dict:
    """Multiple sequence alignment through a Clustal Omega subprocess. Same interface as MAFFT."""
def compute_identity(aligned_seq1: str, aligned_seq2: str) -> float:
    """Compute percent identity from two aligned sequences (with gaps).

    Identity is the number of columns holding the same residue in both
    sequences, divided by the number of alignment columns, times 100.
    Columns where both sequences have a gap ("-") carry no information and
    are left out of the count. The comparison is case-insensitive.

    NOTE(review): other denominators are in use elsewhere (e.g. the shorter
    sequence length); confirm this is the definition the UI should report.

    Args:
        aligned_seq1: First aligned sequence, gaps written as "-".
        aligned_seq2: Second aligned sequence, same length as the first.

    Returns:
        Percent identity in the range 0.0-100.0; 0.0 when there are no
        columns to compare.

    Raises:
        ValueError: If the two aligned sequences differ in length.
    """
    if len(aligned_seq1) != len(aligned_seq2):
        raise ValueError("aligned sequences must have the same length")

    matches = 0
    columns = 0
    for res1, res2 in zip(aligned_seq1.upper(), aligned_seq2.upper()):
        if res1 == "-" and res2 == "-":
            continue
        columns += 1
        # Double-gap columns were skipped above, so equal symbols here are
        # always two identical residues.
        if res1 == res2:
            matches += 1

    if columns == 0:
        return 0.0
    return 100.0 * matches / columns
def format_alignment_fasta(
    sequences: list[dict],  # [{"id": "...", "aligned_sequence": "..."}]
) -> str:
    """Format aligned sequences as FASTA string.

    Each record becomes a ">{id}" header line followed by the aligned
    sequence on a single line, each terminated by a newline. Records keep
    their input order, and sequences are not wrapped.

    Args:
        sequences: Records with an "id" and an "aligned_sequence" key.

    Returns:
        The FASTA text; an empty string for an empty list.

    Raises:
        KeyError: If a record lacks "id" or "aligned_sequence".
    """
    return "".join(
        f">{record['id']}\n{record['aligned_sequence']}\n" for record in sequences
    )
chemlib/bioinformatics/structural_tools.py¶
"""Structural alignment utilities using tmtools and Biopython."""
import numpy as np
from Bio.PDB import Superimposer
from Bio.PDB.Structure import Structure
def tm_align(
    structure1: Structure,
    structure2: Structure,
    chain1: str = "A",
    chain2: str = "A",
) -> dict:
    """
    Align two structures with TM-align (through tmtools).

    Steps:
    1. Extract the CA atom coordinates of the specified chains.
    2. Extract the sequences of those chains.
    3. Call tmtools.tm_align(coords1, coords2, seq1, seq2).
    4. Return the results.

    Returns: {
    "tm_score": 0.85,
    "rmsd": 1.42,
    "aligned_length": 280,
    "rotation_matrix": [[r11,...], [r21,...], [r31,...]],
    "translation_vector": [tx, ty, tz],
    }
    """
def superimpose_biopython(
    structure1: Structure,
    structure2: Structure,
    chain1: str = "A",
    chain2: str = "A",
) -> dict:
    """
    Fit structure2 onto structure1 with Biopython's Superimposer.

    The fit is computed on CA atoms.

    Returns same dict format as tm_align (without tm_score).
    """
def apply_transformation(
    structure: Structure,
    rotation_matrix: list[list[float]],
    translation_vector: list[float],
) -> Structure:
    """
    Rotate and translate every atom of a structure.

    Returns a new transformed Structure.
    """
def extract_ca_coords(structure: Structure, chain_id: str) -> np.ndarray:
    """Return the CA atom coordinates of one chain as an Nx3 numpy array."""
chemlib/bioinformatics/pocket_detection.py¶
"""Fpocket integration for binding pocket detection."""
import subprocess
import tempfile
from pathlib import Path
def run_fpocket(
    pdb_data: str,
    fpocket_binary: str = "fpocket",
) -> list[dict]:
    """
    Detect pockets in a PDB structure by running Fpocket and parsing its output.

    Steps:
    1. Write pdb_data to a temp file: /tmp/fpocket_{uuid}.pdb
    2. Run: {fpocket_binary} -f /tmp/fpocket_{uuid}.pdb
    3. Parse the output directory: /tmp/fpocket_{uuid}_out/
       - pockets/pocket{n}_atm.pdb for pocket atom coordinates
       - fpocket_{uuid}_info.txt for scores (druggability, volume, etc.)
    4. For each pocket:
       - Compute the center from the alpha sphere coordinates
       - Compute the bounding box
       - Extract the residue list
       - Read the druggability score and volume
    5. Clean up the temp files.
    6. Return the pockets sorted by druggability_score descending.

    NOTE(review): fpocket appears to write the alpha sphere centers to
    pockets/pocket{n}_vert.pqr rather than to the _atm.pdb files — confirm
    which file the center should be read from.

    Returns: [{
    "pocket_number": 1,
    "center": (x, y, z),
    "box_size": (sx, sy, sz),
    "residues": ["A:GLU45", "A:ASP52"],
    "druggability_score": 0.82,
    "volume": 456.7,
    "num_alpha_spheres": 45
    }, ...]
    """
def parse_fpocket_info(info_file: Path) -> list[dict]:
    """Read the per-pocket metrics out of an Fpocket info.txt file."""
def parse_pocket_pdb(pocket_pdb: Path) -> dict:
    """Read center, residues and atoms out of a pocket PDB file."""
chemlib/bioinformatics/protein_prep.py¶
"""Protein preparation using PDBFixer."""
from pdbfixer import PDBFixer
import io
def fix_protein(pdb_data: str) -> str:
    """
    Get a protein structure ready for docking.

    Steps using PDBFixer:
    1. Load the PDB data
    2. Find missing residues and add them
    3. Find missing atoms and add them
    4. Remove heterogens (keep water optionally)
    5. Add hydrogens at pH 7.0
    6. Return the fixed PDB string

    Returns: fixed PDB data as string
    """
def remove_water(pdb_data: str) -> str:
    """Strip every water molecule from the PDB data."""
def remove_heterogens(pdb_data: str, keep_water: bool = False) -> str:
    """Strip all HETATM records, optionally keeping water."""
def add_hydrogens(pdb_data: str, ph: float = 7.0) -> str:
    """Use PDBFixer to add hydrogens for the given pH."""
chemlib/bioinformatics/external_apis.py¶
"""HTTP clients for external bioinformatics databases."""
import httpx
class UniProtClient:
    """Client for UniProt REST API (https://rest.uniprot.org)."""

    BASE_URL = "https://rest.uniprot.org"

    async def fetch_entry(self, accession: str) -> dict:
        """
        GET /uniprotkb/{accession}.json

        Returns the parsed JSON holding the protein metadata.
        """

    async def search(self, query: str, limit: int = 25) -> list[dict]:
        """
        GET /uniprotkb/search?query={query}&size={limit}&format=json

        Returns the list of search results.
        """

    def parse_entry(self, data: dict) -> dict:
        """
        Convert a UniProt JSON entry into our internal format:
        {
        "accession": "P00533",
        "name": "Epidermal growth factor receptor",
        "gene_name": "EGFR",
        "organism": "Homo sapiens",
        "sequence": "MRPSGTAGAALL...",
        "sequence_length": 1210,
        "ec_number": "2.7.10.1",
        "function": "Receptor tyrosine kinase...",
        "pdb_ids": ["1M17", "3W2S", ...]
        }
        """
class RCSBClient:
    """Client for RCSB PDB REST API (https://data.rcsb.org)."""

    # Metadata and file downloads are served from different hosts.
    BASE_URL = "https://data.rcsb.org"
    FILES_URL = "https://files.rcsb.org"

    async def fetch_entry_info(self, pdb_id: str) -> dict:
        """GET /rest/v1/core/entry/{pdb_id} — metadata of the entry."""

    async def download_pdb(self, pdb_id: str) -> str:
        """GET https://files.rcsb.org/download/{pdb_id}.pdb — the raw PDB file."""

    async def download_mmcif(self, pdb_id: str) -> str:
        """GET https://files.rcsb.org/download/{pdb_id}.cif — the raw mmCIF file."""

    async def get_uniprot_mapping(self, pdb_id: str) -> list[str]:
        """Return the UniProt accessions found in the PDB entry metadata."""
class AlphaFoldClient:
    """Client for AlphaFold DB API (https://alphafold.ebi.ac.uk/api)."""

    # Root URL; the endpoint path on fetch_prediction() is relative to it.
    BASE_URL = "https://alphafold.ebi.ac.uk/api"

    async def fetch_prediction(self, uniprot_id: str) -> dict:
        """GET /prediction/{uniprot_id} — prediction metadata including PDB URL.

        Args:
            uniprot_id: UniProt accession of the predicted protein.

        NOTE(review): confirm whether the endpoint returns a single JSON
        object or a list of predictions before relying on the ``dict``
        return annotation.
        """

    async def download_pdb(self, uniprot_id: str) -> str:
        """Download the predicted PDB file.

        Args:
            uniprot_id: UniProt accession of the predicted protein.

        Returns:
            The predicted structure as PDB text.
        """
API Endpoints¶
Protein Targets — chemlib/api/targets.py¶
| Method | Endpoint | Description | Request Body | Response |
|---|---|---|---|---|
| GET | `/api/targets/` | List all protein targets | Query: ProteinTargetFilter | ProteinTargetListResponse |
| POST | `/api/targets/` | Create a protein target manually | ProteinTargetCreate | ProteinTargetResponse |
| GET | `/api/targets/{id}` | Get target details | — | ProteinTargetResponse |
| PUT | `/api/targets/{id}` | Update target metadata | ProteinTargetCreate | ProteinTargetResponse |
| DELETE | `/api/targets/{id}` | Delete target + structures | — | 204 |
| GET | `/api/targets/{id}/structures` | List structures for target | — | list[ProteinStructureResponse] |
Protein Structures — chemlib/api/structures.py¶
| Method | Endpoint | Description | Request Body | Response |
|---|---|---|---|---|
| GET | `/api/structures/{id}` | Get structure details (incl. data) | — | ProteinStructureDetailResponse |
| POST | `/api/structures/` | Upload a structure | ProteinStructureCreate | ProteinStructureResponse |
| DELETE | `/api/structures/{id}` | Delete structure | — | 204 |
| GET | `/api/structures/{id}/chains` | Get chain info | — | list[dict] |
| GET | `/api/structures/{id}/sequence/{chain}` | Extract sequence for chain | — | {"sequence": "MRPS..."} |
| GET | `/api/structures/{id}/binding-sites` | List binding sites | — | list[BindingSiteResponse] |
| POST | `/api/structures/{id}/binding-sites` | Define a binding site manually | BindingSiteCreate | BindingSiteResponse |
| POST | `/api/structures/{id}/binding-sites/from-ligand` | Define from ligand | BindingSiteFromLigand | BindingSiteResponse |
| POST | `/api/structures/{id}/detect-pockets` | Run Fpocket | Query: min_druggability | list[BindingSiteResponse] |
External Fetch — chemlib/api/targets.py (or separate)¶
| Method | Endpoint | Description | Response |
|---|---|---|---|
| POST | `/api/fetch/uniprot/{accession}` | Import target from UniProt | ProteinTargetResponse |
| POST | `/api/fetch/rcsb/{pdb_id}` | Fetch structure from RCSB | ProteinStructureResponse |
| POST | `/api/fetch/alphafold/{uniprot_id}` | Fetch from AlphaFold DB | ProteinStructureResponse |
| GET | `/api/search/uniprot` | Search UniProt | Query: q (string) |
Alignments — chemlib/api/alignments.py¶
| Method | Endpoint | Description | Request Body | Response |
|---|---|---|---|---|
| POST | `/api/alignments/sequence` | Run sequence alignment | SequenceAlignmentRequest | SequenceAlignmentResponse |
| GET | `/api/alignments/sequence/{id}` | Get alignment result | — | SequenceAlignmentResponse |
| GET | `/api/alignments/sequence/{id}/image` | Get alignment image (PNG) | — | image/png |
| GET | `/api/alignments/sequence` | List sequence alignments | Query: limit, offset | list[SequenceAlignmentResponse] |
| POST | `/api/alignments/structure` | Run structural alignment | StructuralAlignmentRequest | StructuralAlignmentResponse |
| GET | `/api/alignments/structure/{id}` | Get structural alignment | — | StructuralAlignmentResponse |
| GET | `/api/alignments/structure` | List structural alignments | Query: limit, offset | list[StructuralAlignmentResponse] |
Visualization¶
3D Protein Viewer (3Dmol.js)¶
The existing 3Dmol.js integration is extended for protein structures. The viewer is a reusable component in chemlib/static/js/protein_viewer.js.
Viewer Modes:
| Mode | 3Dmol.js Style | Use Case |
|---|---|---|
| Cartoon | `setStyle({cartoon: {color: 'spectrum'}})` | Default protein view |
| Surface | `addSurface(...)` with transparency | Show molecular surface |
| Ball-and-stick | `setStyle({stick: {}, sphere: {scale: 0.3}})` | Ligand detail |
| Ribbon | `setStyle({cartoon: {style: 'trace'}})` | Simplified backbone |
Binding Site Visualization:
- Box overlay: addBox({center: {x, y, z}, dimensions: {w, h, d}, color: 'green', opacity: 0.3})
- Surface coloring: color residues in the binding site differently from the rest of the protein
- Residue highlighting: setStyle({resi: [45, 52, 721], chain: 'A'}, {stick: {color: 'yellow'}})
Protein Viewer JavaScript API (protein_viewer.js):
/**
 * Reusable 3Dmol.js wrapper for showing protein structures, binding sites,
 * ligands, and superposed structures.
 *
 * This is a design skeleton: each method body is a comment outlining the
 * intended steps rather than an implementation.
 */
class ProteinViewer {
  /** @param containerId Identifier of the container the viewer is created in. */
  constructor(containerId) { /* Initialize 3Dmol.GLViewer */ }

  /**
   * Load one stored structure and show it in the default cartoon style.
   * @param structureId ID used in the /api/structures/{id} request.
   */
  async loadProtein(structureId) {
    // Fetch PDB data from /api/structures/{id}
    // viewer.addModel(data, "pdb")
    // viewer.setStyle({}, {cartoon: {color: 'spectrum'}})
  }

  /**
   * Overlay a binding site on the loaded structure.
   * @param siteData Binding-site record to display.
   */
  showBindingSite(siteData) {
    // Draw box around binding site
    // Highlight residues
  }

  /**
   * Display a ligand.
   * @param ligandId Identifier of the ligand to show.
   */
  showLigand(ligandId) {
    // Show co-crystallized ligand in ball-and-stick
  }

  /**
   * Add a second structure on top of the current one.
   * @param pdbData PDB text of the structure to overlay.
   * @param color Cartoon colour applied to the overlaid structure.
   */
  overlayStructure(pdbData, color) {
    // Add second structure for superposition view
    // viewer.addModel(pdbData, "pdb")
    // viewer.setStyle({model: 1}, {cartoon: {color: color}})
    // NOTE(review): the fixed index 1 assumes exactly one model was loaded
    // beforehand — confirm, or select by the model addModel() returns.
  }

  /** @param style Rendering style to switch to. */
  setStyle(style) { /* Switch between cartoon, surface, stick */ }

  /** @param siteData Binding-site record to centre the view on. */
  zoomToSite(siteData) { /* Center view on binding site */ }

  /** Reset the viewer to an empty state. */
  clear() { /* Remove all models */ }
}
Sequence Alignment Viewer (BioJS MSA Viewer)¶
The BioJS MSA Viewer provides interactive, scrollable sequence alignment visualization in the browser. The library is either loaded from a CDN or bundled with the application's static assets.
<!-- alignment_viewer.html -->
<div id="msa-viewer"></div>
<script>
  // Top-level `await` is a syntax error in a classic (non-module) script,
  // so the fetch-and-render sequence runs inside an async function.
  (async function () {
    // Fetch alignment data
    const response = await fetch(`/api/alignments/sequence/${alignmentId}`);
    if (!response.ok) {
      // Fail loudly instead of handing an error payload to the viewer.
      throw new Error(`Failed to load alignment ${alignmentId}: HTTP ${response.status}`);
    }
    const data = await response.json();

    // Initialize MSA viewer
    const msa = require("msa");
    const viewer = msa({
      el: document.getElementById("msa-viewer"),
      seqs: parseFasta(data.alignment_data),
      colorscheme: {"scheme": "clustal"}, // or "zappo", "hydrophobicity"
      vis: {
        conserv: true,     // Show conservation track
        overviewbox: true  // Show minimap
      }
    });
    viewer.render();
  })();
</script>
Color Schemes: Clustal (default), Zappo (physicochemical), Hydrophobicity, Taylor, Buried.
Static Alignment Images (pyMSAviz)¶
pyMSAviz is used for server-side rendering of alignment images (e.g., for reports and downloads).
# In alignment_service.py
from pymsaviz import MsaViz
def generate_alignment_image(alignment_fasta: str) -> bytes:
    """Render a FASTA alignment as a PNG image and return the raw bytes.

    Args:
        alignment_fasta: The alignment itself as FASTA-formatted text
            (not a file path).

    Returns:
        PNG-encoded image bytes of the rendered alignment.
    """
    import tempfile
    from pathlib import Path

    # MsaViz interprets a plain ``str`` as a file path or URL, and its
    # ``savefig`` only accepts a destination path (no ``format`` keyword),
    # so both the input and the output go through a scratch directory.
    with tempfile.TemporaryDirectory() as scratch_dir:
        fasta_path = Path(scratch_dir) / "alignment.fasta"
        png_path = Path(scratch_dir) / "alignment.png"
        fasta_path.write_text(alignment_fasta, encoding="utf-8")

        # The colour scheme is a constructor option; ``set_plot_params``
        # does not take a ``color_scheme`` keyword.
        mv = MsaViz(
            fasta_path,
            color_scheme="Clustal",
            wrap_length=80,
            show_count=True,
        )
        mv.savefig(png_path, dpi=150)
        return png_path.read_bytes()
Structural Superposition Viewer¶
When viewing a structural alignment result, the 3D viewer loads both structures:
// In protein_viewer.js
/**
 * Show a stored structural alignment as a two-colour superposition.
 *
 * Fetches the alignment record and its reference structure, adds the
 * reference (blue) and the transformed second structure (red) to the
 * viewer, then displays the alignment metrics.
 *
 * @param alignmentId ID used in the /api/alignments/structure/{id} request.
 * @throws {Error} When either HTTP request returns a non-2xx status.
 */
async function showStructuralAlignment(alignmentId) {
  // Reject on HTTP errors so an error payload is never passed to the viewer.
  const fetchJson = async (url) => {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Request to ${url} failed with HTTP ${response.status}`);
    }
    return response.json();
  };

  const data = await fetchJson(`/api/alignments/structure/${alignmentId}`);

  // Load structure 1 (reference) in blue.
  // Style by the model object addModel() returns, not by a fixed index:
  // indices 0/1 are only right when the viewer was empty beforehand.
  const struct1 = await fetchJson(`/api/structures/${data.structure1_id}`);
  const referenceModel = viewer.addModel(struct1.structure_data, "pdb");
  viewer.setStyle({model: referenceModel}, {cartoon: {color: '#3498db'}});

  // Load structure 2 (aligned/transformed) in red.
  // aligned_pdb_data is a nullable column, so it may be absent.
  if (data.aligned_pdb_data) {
    const alignedModel = viewer.addModel(data.aligned_pdb_data, "pdb");
    viewer.setStyle({model: alignedModel}, {cartoon: {color: '#e74c3c'}});
  }

  // Show metrics overlay
  showMetrics({tm_score: data.tm_score, rmsd: data.rmsd, aligned_length: data.aligned_length});
  viewer.zoomTo();
  viewer.render();
}
UI Pages¶
Protein Browser (protein_browser.html)¶
┌─────────────────────────────────────────────────────────────────┐
│ Protein Target Library [Import from UniProt] │
├─────────────────────────────────────────────────────────────────┤
│ Search: [________________] [Organism: ▾] [EC: ▾] [Search] │
├─────────────────────────────────────────────────────────────────┤
│ ┌─────────┬──────────┬───────────┬──────────┬────────┬──────┐ │
│ │ Name │ Gene │ Organism │ UniProt │ # Str │ Act │ │
│ ├─────────┼──────────┼───────────┼──────────┼────────┼──────┤ │
│ │ EGFR │ EGFR │ H.sapiens │ P00533 │ 12 │ View │ │
│ │ BRAF │ BRAF │ H.sapiens │ P15056 │ 8 │ View │ │
│ │ ... │ │ │ │ │ │ │
│ └─────────┴──────────┴───────────┴──────────┴────────┴──────┘ │
│ ◀ 1 2 3 ... ▶ │
└─────────────────────────────────────────────────────────────────┘
Protein Detail (protein_detail.html)¶
┌─────────────────────────────────────────────────────────────────┐
│ ◀ Back to Library │
│ │
│ EGFR — Epidermal Growth Factor Receptor │
│ Gene: EGFR | Organism: Homo sapiens | UniProt: P00533 │
│ EC: 2.7.10.1 | Length: 1210 aa │
│ │
│ Function: Receptor tyrosine kinase binding ligands... │
│ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ 3D Structure Viewer (3Dmol.js) │ │
│ │ │ │
│ │ [Cartoon ribbon of selected structure] │ │
│ │ │ │
│ │ Style: [Cartoon ▾] Chain: [A ▾] Show binding sites: ☑ │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │
│ Structures: │
│ ┌──────┬────────┬──────────┬──────┬───────────┬─────────────┐ │
│ │ PDB │ Source │ Method │ Res │ Chains │ Actions │ │
│ ├──────┼────────┼──────────┼──────┼───────────┼─────────────┤ │
│ │ 1M17 │ RCSB │ X-ray │ 2.6Å │ A │ View|Sites │ │
│ │ AF- │ AlphaF │ Predicted│ — │ A │ View|Sites │ │
│ └──────┴────────┴──────────┴──────┴───────────┴─────────────┘ │
│ [+ Fetch from RCSB] [+ Fetch from AlphaFold] [+ Upload] │
│ │
│ Binding Sites (for 1M17): │
│ ┌───────────────┬────────────────┬───────┬────────────┬──────┐ │
│ │ Name │ Method │ Score │ Volume │ Act │ │
│ ├───────────────┼────────────────┼───────┼────────────┼──────┤ │
│ │ Pocket 1 │ Fpocket │ 0.89 │ 567 Å³ │ View │ │
│ │ ATP site │ Ligand-based │ — │ 423 Å³ │ View │ │
│ └───────────────┴────────────────┴───────┴────────────┴──────┘ │
│ [Detect Pockets (Fpocket)] [Define from Ligand] [Define Manual]│
│ │
│ Sequence: │
│ MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQRMFNNCEVVL... │
│ [Copy] [Align with...] [BLAST search] │
└─────────────────────────────────────────────────────────────────┘
Alignment Viewer (alignment_viewer.html)¶
┌─────────────────────────────────────────────────────────────────┐
│ Sequence Alignment: EGFR_HUMAN vs ERBB2_HUMAN │
│ Method: BLOSUM62 | Identity: 42.3% | Score: 856.0 │
│ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ [Interactive MSA Viewer — BioJS] │ │
│ │ Color scheme: [Clustal ▾] Wrap: [80 ▾] │ │
│ │ │ │
│ │ EGFR_HUMAN MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSN----KL │ │
│ │ ERBB2_HUM MELAALCRWGLLLALLPPGA--ASTQVCTGTDMKLRL---PN │ │
│ │ ** * * ** * * * * │ │
│ │ [scrollable, with conservation bars below] │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │
│ [Download FASTA] [Download Image (PNG)] │
└─────────────────────────────────────────────────────────────────┘
Alembic Migration¶
The migration adds all new tables in a single migration file:
# alembic/versions/xxxx_add_protein_target_tables.py
def upgrade():
    """Create the protein target, structure, binding-site and alignment tables.

    Tables are created parents-first so every foreign key references a table
    that already exists.

    Column defaults are declared with ``server_default``: a Python-side
    ``default=`` on a column passed to ``op.create_table`` is never rendered
    into the CREATE TABLE statement, so it would leave the NOT NULL columns
    without any database-level default.
    """
    op.create_table(
        "protein_targets",
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("uniprot_id", sa.String(20), unique=True, index=True, nullable=True),
        sa.Column("pdb_ids", sa.JSON, nullable=True),
        sa.Column("name", sa.String(500), nullable=False),
        sa.Column("gene_name", sa.String(100), nullable=True),
        sa.Column("organism", sa.String(200), nullable=True),
        sa.Column("description", sa.Text, nullable=True),
        sa.Column("sequence", sa.Text, nullable=True),
        sa.Column("sequence_length", sa.Integer, nullable=True),
        sa.Column("ec_number", sa.String(50), nullable=True),
        sa.Column("function_description", sa.Text, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now()),
        sa.Column("updated_at", sa.DateTime(timezone=True), server_default=sa.func.now()),
    )
    op.create_table(
        "protein_structures",
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column(
            "protein_target_id",
            sa.Integer,
            sa.ForeignKey("protein_targets.id", ondelete="CASCADE"),
            nullable=False,
            index=True,
        ),
        sa.Column("pdb_id", sa.String(10), nullable=True, index=True),
        sa.Column("source", sa.String(20), nullable=False),
        sa.Column("file_format", sa.String(10), nullable=False, server_default="pdb"),
        sa.Column("structure_data", sa.Text, nullable=False),
        sa.Column("resolution", sa.Float, nullable=True),
        sa.Column("method", sa.String(20), nullable=True),
        sa.Column("chains", sa.JSON, nullable=True),
        sa.Column("has_ligand", sa.Boolean, server_default=sa.false()),
        sa.Column("ligand_ids", sa.JSON, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now()),
    )
    op.create_table(
        "binding_sites",
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column(
            "protein_structure_id",
            sa.Integer,
            sa.ForeignKey("protein_structures.id", ondelete="CASCADE"),
            nullable=False,
            index=True,
        ),
        sa.Column("name", sa.String(200), nullable=False),
        sa.Column("center_x", sa.Float, nullable=False),
        sa.Column("center_y", sa.Float, nullable=False),
        sa.Column("center_z", sa.Float, nullable=False),
        # Numeric server defaults are given as SQL text so they render unquoted.
        sa.Column("box_size_x", sa.Float, nullable=False, server_default=sa.text("20.0")),
        sa.Column("box_size_y", sa.Float, nullable=False, server_default=sa.text("20.0")),
        sa.Column("box_size_z", sa.Float, nullable=False, server_default=sa.text("20.0")),
        sa.Column("residues", sa.JSON, nullable=True),
        sa.Column("druggability_score", sa.Float, nullable=True),
        sa.Column("volume", sa.Float, nullable=True),
        sa.Column("detection_method", sa.String(20), nullable=False, server_default="manual"),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now()),
    )
    op.create_table(
        "sequence_alignments",
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("name", sa.String(300), nullable=False),
        sa.Column("alignment_type", sa.String(20), nullable=False),
        sa.Column("method", sa.String(20), nullable=False),
        sa.Column("input_sequences", sa.JSON, nullable=False),
        sa.Column("alignment_data", sa.Text, nullable=False),
        sa.Column("score", sa.Float, nullable=True),
        sa.Column("identity_pct", sa.Float, nullable=True),
        sa.Column("num_sequences", sa.Integer, nullable=False),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now()),
    )
    op.create_table(
        "structural_alignments",
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("name", sa.String(300), nullable=False),
        sa.Column(
            "structure1_id",
            sa.Integer,
            sa.ForeignKey("protein_structures.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column(
            "structure2_id",
            sa.Integer,
            sa.ForeignKey("protein_structures.id", ondelete="CASCADE"),
            nullable=False,
        ),
        sa.Column("method", sa.String(20), nullable=False),
        sa.Column("tm_score", sa.Float, nullable=True),
        sa.Column("rmsd", sa.Float, nullable=True),
        sa.Column("aligned_length", sa.Integer, nullable=True),
        sa.Column("rotation_matrix", sa.JSON, nullable=True),
        sa.Column("translation_vector", sa.JSON, nullable=True),
        sa.Column("aligned_pdb_data", sa.Text, nullable=True),
        sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.func.now()),
    )
def downgrade():
    """Drop the five protein-library tables, referencing tables first."""
    # Order matters: tables holding foreign keys go before the tables they
    # reference.
    tables_in_drop_order = (
        "structural_alignments",
        "sequence_alignments",
        "binding_sites",
        "protein_structures",
        "protein_targets",
    )
    for table_name in tables_in_drop_order:
        op.drop_table(table_name)
Testing Strategy¶
| Component | Test Type | Fixtures |
|---|---|---|
| ORM models | Unit | In-memory SQLite |
| Bioinformatics utils | Unit | Sample PDB files, known sequences |
| External API clients | Unit (mocked) | Mocked httpx responses |
| Services | Integration | SQLite + mocked external APIs |
| API routes | E2E | Full stack with SQLite, mocked external APIs |
| Fpocket integration | Integration | Requires fpocket binary (skip if not installed) |
Sample Test Fixtures¶
# tests/conftest.py additions
# Minimal PDB sample. PDB is a fixed-column format (e.g. x/y/z occupy columns
# 31-38, 39-46 and 47-54 of an ATOM record), so the spacing inside each record
# must be kept exactly as written. The "..." line stands in for elided records.
SAMPLE_PDB_DATA = """HEADER    HYDROLASE                               01-JAN-00   XXXX
ATOM      1  N   ALA A   1       1.000   2.000   3.000  1.00  0.00           N
ATOM      2  CA  ALA A   1       2.000   3.000   4.000  1.00  0.00           C
...
END
"""
# Short amino-acid sequence used by the protein target fixture.
SAMPLE_SEQUENCE = "MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFEDHFLSLQ"
@pytest.fixture
async def protein_target(db_session):
    """Persist a sample EGFR ``ProteinTarget`` and return the refreshed row."""
    fields = {
        "name": "Test Protein",
        "uniprot_id": "P00533",
        "gene_name": "EGFR",
        "organism": "Homo sapiens",
        "sequence": SAMPLE_SEQUENCE,
        "sequence_length": len(SAMPLE_SEQUENCE),
    }
    record = ProteinTarget(**fields)
    db_session.add(record)
    await db_session.commit()
    # Refresh after the commit, before handing the row to the test.
    await db_session.refresh(record)
    return record
@pytest.fixture
async def protein_structure(db_session, protein_target):
    """Persist a sample RCSB ``ProteinStructure`` linked to ``protein_target``."""
    fields = {
        "protein_target_id": protein_target.id,
        "pdb_id": "1M17",
        "source": "rcsb",
        "file_format": "pdb",
        "structure_data": SAMPLE_PDB_DATA,
        "resolution": 2.6,
        "method": "xray",
        "chains": ["A"],
        "has_ligand": True,
        "ligand_ids": ["AQ4"],
    }
    record = ProteinStructure(**fields)
    db_session.add(record)
    await db_session.commit()
    # Refresh after the commit, before handing the row to the test.
    await db_session.refresh(record)
    return record

