Accessing Structures and PAE Data
Source:vignettes/accessing-structures.Rmd
accessing-structures.RmdThis vignette shows how to extract structure files and PAE (Predicted Aligned Error) matrices from installed TSP datasets.
Prerequisites
First, install a dataset:
# Install from Zenodo sandbox (test dataset)
install_dataset(415123, sandbox = TRUE)Finding Structure IDs
Each structure has a unique id in the metadata. This is
different from structure_id, which is the protein
identifier (not unique when multiple models exist per protein).
meta <- load_metadata(415123, lazy = FALSE)
# View available structures
meta |>
select(id, structure_id, prediction_source, model_rank, mean_plddt)
#> # A tibble: 62 x 5
#> id structure_id prediction_source model_rank mean_plddt
#> <chr> <chr> <chr> <int> <dbl>
#> 1 P00571_AF3_1 P00571 alphafold3 1 89.2
#> 2 P00571_AF3_2 P00571 alphafold3 2 87.8
#> 3 P00571_AF3_3 P00571 alphafold3 3 86.4
#> ...
# Find the best model for a protein
meta |>
filter(structure_id == "P00571") |>
arrange(desc(mean_plddt)) |>
slice(1) |>
pull(id)
#> [1] "P00571_AF3_1"Extracting Structure Files
Use get_structure() to extract PDB or CIF files from the
compressed batches:
# Get a single structure
path <- get_structure("P00571_AF3_1", 415123)
path
#> P00571_AF3_1
#> "/path/to/.extracted/structures/P00571_AF3_1.cif"
# The file is now available on disk
file.exists(path)
#> [1] TRUE
# Get multiple structures at once
ids <- c("P00571_AF3_1", "P00571_AF3_2", "P00571_BZ2_1")
paths <- get_structure(ids, 415123)
paths
#> P00571_AF3_1 P00571_AF3_2 P00571_BZ2_1
#> "/path/to/.../P00571_AF3_1.cif" "/path/to/.../P00571_AF3_2.cif" "/path/to/.../P00571_BZ2_1.pdb"Extraction Caching
Structures are cached after first extraction. Subsequent calls return the cached path without re-extracting:
# First call extracts from tar.gz
system.time(path1 <- get_structure("P00571_AF3_1", 415123))
#> user system elapsed
#> 0.05 0.02 0.12
# Second call uses cache (instant)
system.time(path2 <- get_structure("P00571_AF3_1", 415123))
#> user system elapsed
#> 0.00 0.00 0.00
identical(path1, path2)
#> [1] TRUELoading PAE Matrices
PAE (Predicted Aligned Error) matrices show the expected position error between residue pairs. Lower values indicate higher confidence in the relative positions.
# Get PAE as a matrix
pae <- get_pae("P00571_AF3_1", 415123)
dim(pae)
#> [1] 194 194
# Values are in Angstroms (lower = better)
range(pae)
#> [1] 0.45 28.31PAE Output Formats
get_pae() supports three output formats:
# Matrix (default) - good for heatmaps
pae_matrix <- get_pae("P00571_AF3_1", 415123, as = "matrix")
class(pae_matrix)
#> [1] "matrix" "array"
# Data frame (long format) - good for ggplot2
pae_df <- get_pae("P00571_AF3_1", 415123, as = "data.frame")
head(pae_df)
#> residue_i residue_j pae
#> 1 1 1 0.45
#> 2 2 1 1.23
#> 3 3 1 2.87
#> ...
# List (raw JSON) - includes all metadata
pae_list <- get_pae("P00571_AF3_1", 415123, as = "list")
names(pae_list)
#> [1] "id" "prediction_source" "residue_count" "pae"Visualizing PAE
# Simple base R heatmap
pae <- get_pae("P00571_AF3_1", 415123)
image(pae,
col = hcl.colors(50, "YlOrRd", rev = TRUE),
main = "PAE Matrix",
xlab = "Residue i",
ylab = "Residue j")
# ggplot2 version
library(ggplot2)
pae_df <- get_pae("P00571_AF3_1", 415123, as = "data.frame")
ggplot(pae_df, aes(residue_i, residue_j, fill = pae)) +
geom_raster() +
scale_fill_viridis_c(option = "rocket", direction = -1,
name = "PAE (A)") +
coord_fixed() +
labs(title = "Predicted Aligned Error",
x = "Residue i", y = "Residue j") +
theme_minimal()Per-Residue pLDDT
Per-residue pLDDT scores may be available in the PAE JSON files:
plddt <- get_plddt("P00571_AF3_1", 415123)
if (!is.null(plddt)) {
# Plot pLDDT along sequence
plot(plddt, type = "l",
xlab = "Residue", ylab = "pLDDT",
main = "Per-residue confidence")
abline(h = c(70, 90), lty = 2, col = "gray")
} else {
# Per-residue pLDDT not in this dataset
# Use mean_plddt from metadata instead
meta <- load_metadata(415123, lazy = FALSE)
meta |>
filter(id == "P00571_AF3_1") |>
pull(mean_plddt)
}Batch Processing
Process multiple structures efficiently:
# Get all high-confidence structures for a protein
meta <- load_metadata(415123, lazy = FALSE)
best_models <- meta |>
filter(structure_id == "P00571", mean_plddt > 85) |>
pull(id)
# Extract all at once
paths <- get_structure(best_models, 415123)
# Process each structure
for (id in names(paths)) {
cat("Processing:", id, "\n")
# Load PAE
pae <- get_pae(id, 415123)
# Calculate mean PAE (overall confidence)
mean_pae <- mean(pae)
cat(" Mean PAE:", round(mean_pae, 2), "A\n")
}Managing Disk Space
Extracted files are cached in the dataset directory. Clear them when done:
# Clear extraction cache for a dataset
clear_extracted(415123)
# Or remove the entire dataset
remove_dataset(415123)Summary
| Function | Purpose |
|---|---|
get_structure(id, record_id) |
Extract structure file(s) from batch |
get_pae(id, record_id, as) |
Load PAE matrix |
get_plddt(id, record_id) |
Load per-residue pLDDT |
clear_extracted(record_id) |
Clear extraction cache |
Key points: - Use id column (unique) to identify
structures, not structure_id (protein ID) - Extracted files
are cached - first call extracts, subsequent calls are instant - PAE
values are in Angstroms - lower is better - Per-residue pLDDT may not be
available in all datasets