Accessing Structures and PAE Data • tslstructures

This vignette shows how to extract structure files and PAE (Predicted Aligned Error) matrices from installed TSP datasets.

library(tslstructures)
library(dplyr)

Prerequisites

First, install a dataset:

# Install from Zenodo sandbox (test dataset)
install_dataset(415123, sandbox = TRUE)

Finding Structure IDs

Each structure has a unique id in the metadata. This is different from structure_id, which is the protein identifier (not unique when multiple models exist per protein).

meta <- load_metadata(415123, lazy = FALSE)

# View available structures
meta |>
  select(id, structure_id, prediction_source, model_rank, mean_plddt)
#> # A tibble: 62 x 5
#>    id           structure_id prediction_source model_rank mean_plddt
#>    <chr>        <chr>        <chr>                  <int>      <dbl>
#>  1 P00571_AF3_1 P00571       alphafold3                 1       89.2
#>  2 P00571_AF3_2 P00571       alphafold3                 2       87.8
#>  3 P00571_AF3_3 P00571       alphafold3                 3       86.4
#>  ...

# Find the best model for a protein
meta |>
  filter(structure_id == "P00571") |>
  arrange(desc(mean_plddt)) |>
  slice(1) |>
  pull(id)
#> [1] "P00571_AF3_1"

Extracting Structure Files

Use get_structure() to extract PDB or CIF files from the compressed batches:

# Get a single structure
path <- get_structure("P00571_AF3_1", 415123)
path
#>              P00571_AF3_1
#> "/path/to/.extracted/structures/P00571_AF3_1.cif"

# The file is now available on disk
file.exists(path)
#> [1] TRUE

# Get multiple structures at once
ids <- c("P00571_AF3_1", "P00571_AF3_2", "P00571_BZ2_1")
paths <- get_structure(ids, 415123)
paths
#>              P00571_AF3_1              P00571_AF3_2              P00571_BZ2_1
#> "/path/to/.../P00571_AF3_1.cif" "/path/to/.../P00571_AF3_2.cif" "/path/to/.../P00571_BZ2_1.pdb"

Extraction Caching

Structures are cached after first extraction. Subsequent calls return the cached path without re-extracting:

# First call extracts from tar.gz
system.time(path1 <- get_structure("P00571_AF3_1", 415123))
#>    user  system elapsed
#>   0.05    0.02    0.12

# Second call uses cache (instant)
system.time(path2 <- get_structure("P00571_AF3_1", 415123))
#>    user  system elapsed
#>   0.00    0.00    0.00

identical(path1, path2)
#> [1] TRUE

Loading PAE Matrices

PAE (Predicted Aligned Error) matrices show the expected position error between residue pairs. Lower values indicate higher confidence in the relative positions.

# Get PAE as a matrix
pae <- get_pae("P00571_AF3_1", 415123)

dim(pae)
#> [1] 194 194

# Values are in Angstroms (lower = better)
range(pae)
#> [1]  0.45 28.31

PAE Output Formats

get_pae() supports three output formats:

# Matrix (default) - good for heatmaps
pae_matrix <- get_pae("P00571_AF3_1", 415123, as = "matrix")
class(pae_matrix)
#> [1] "matrix" "array"

# Data frame (long format) - good for ggplot2
pae_df <- get_pae("P00571_AF3_1", 415123, as = "data.frame")
head(pae_df)
#>   residue_i residue_j   pae
#> 1         1         1  0.45
#> 2         2         1  1.23
#> 3         3         1  2.87
#> ...

# List (raw JSON) - includes all metadata
pae_list <- get_pae("P00571_AF3_1", 415123, as = "list")
names(pae_list)
#> [1] "id" "prediction_source" "residue_count" "pae"

Visualizing PAE

# Simple base R heatmap
pae <- get_pae("P00571_AF3_1", 415123)
image(pae,
      col = hcl.colors(50, "YlOrRd", rev = TRUE),
      main = "PAE Matrix",
      xlab = "Residue i",
      ylab = "Residue j")

# ggplot2 version
library(ggplot2)

pae_df <- get_pae("P00571_AF3_1", 415123, as = "data.frame")

ggplot(pae_df, aes(residue_i, residue_j, fill = pae)) +
  geom_raster() +
  scale_fill_viridis_c(option = "rocket", direction = -1,
                       name = "PAE (A)") +
  coord_fixed() +
  labs(title = "Predicted Aligned Error",
       x = "Residue i", y = "Residue j") +
  theme_minimal()

Per-Residue pLDDT

Per-residue pLDDT scores may be available in the PAE JSON files:

plddt <- get_plddt("P00571_AF3_1", 415123)

if (!is.null(plddt)) {
  # Plot pLDDT along sequence
  plot(plddt, type = "l",
       xlab = "Residue", ylab = "pLDDT",
       main = "Per-residue confidence")
  abline(h = c(70, 90), lty = 2, col = "gray")
} else {
  # Per-residue pLDDT not in this dataset
  # Use mean_plddt from metadata instead
  meta <- load_metadata(415123, lazy = FALSE)
  meta |>
    filter(id == "P00571_AF3_1") |>
    pull(mean_plddt)
}

Batch Processing

Process multiple structures efficiently:

# Get all high-confidence structures for a protein
meta <- load_metadata(415123, lazy = FALSE)

best_models <- meta |>
  filter(structure_id == "P00571", mean_plddt > 85) |>
  pull(id)

# Extract all at once
paths <- get_structure(best_models, 415123)

# Process each structure
for (id in names(paths)) {
  cat("Processing:", id, "\n")

  # Load PAE
  pae <- get_pae(id, 415123)

  # Calculate mean PAE (overall confidence)
  mean_pae <- mean(pae)
  cat("  Mean PAE:", round(mean_pae, 2), "A\n")
}

Managing Disk Space

Extracted files are cached in the dataset directory. Clear them when done:

# Clear extraction cache for a dataset
clear_extracted(415123)

# Or remove the entire dataset
remove_dataset(415123)

Summary

Function	Purpose
`get_structure(id, record_id)`	Extract structure file(s) from batch
`get_pae(id, record_id, as)`	Load PAE matrix
`get_plddt(id, record_id)`	Load per-residue pLDDT
`clear_extracted(record_id)`	Clear extraction cache

Key points: - Use id column (unique) to identify structures, not structure_id (protein ID) - Extracted files are cached - first call extracts, subsequent calls are instant - PAE values are in Angstroms - lower is better - Per-residue pLDDT may not be available in all datasets