Skip to content

Using Datasets

This tutorial shows how to find, download, and work with TSP datasets using the tslstructures R package.

Prerequisites

library(tslstructures)
library(dplyr)

1. Discover Datasets

Browse the datasets published in the TSL Structures community on Zenodo:

# List all available datasets
datasets <- list_datasets()

# View basic info
datasets
#> # A tibble: 5 × 6
#>   record_id title                              doi                version created
#>   <chr>     <chr>                              <chr>              <chr>   <chr>
#> 1 12345678  Arabidopsis Proteome Structures    10.5281/zenodo...  1.0.0   2024-...
#> 2 12345679  Plant Pathogen Effector Structures 10.5281/zenodo...  1.0.0   2024-...

Search and Filter

# Get extended metadata for filtering
datasets <- list_datasets(full = TRUE)

# Filter by predictor
datasets |>
  filter(has_predictor(predictors, "alphafold3"))

# Filter by size
datasets |>
  filter(is_small(total_size_bytes))  # < 100 MB

# Filter by structure count
datasets |>
  filter(has_min_structures(structure_count, 1000))

2. Install a Dataset

Download and cache a dataset locally:

# Install by DOI
install_dataset("10.5281/zenodo.12345678")

# Or by record ID
install_dataset("12345678")

# Check what's installed
installed_datasets()

The dataset is downloaded once and cached for future use.


3. Load a Dataset

# Load an installed dataset by its name (names are listed in installed_datasets()$name)
ds <- load_dataset("arabidopsis-structures")

# View summary
ds
#> TSP Dataset: arabidopsis-structures
#> Structures: 25,432
#> Proteins: 8,234
#> Sources: alphafold2, alphafold3, boltz2

4. Query Metadata

Filter structures by their metadata, without extracting any structure files:

# Find high-confidence structures
high_conf <- ds |>
  filter(plddt_mean > 90, ptm > 0.85)

# Find structures for a specific protein
my_protein <- ds |>
  filter(protein_id == "P12345")

# Find rank-1 predictions only
best_models <- ds |>
  filter(rank == 1)

# Combine filters
kinases <- ds |>
  filter(
    plddt_mean > 85,
    grepl("kinase", annotation, ignore.case = TRUE),
    source == "alphafold3"
  )

5. Extract Structures

Extract the actual structure files from the downloaded dataset archives:

# Get a single structure
structure <- get_structure(ds, "P12345_AF3_1")
#> Extracting P12345_AF3_1.cif from batch_003.tar.gz

# Returns the file path
structure
#> [1] "/path/to/cache/datasets/arabidopsis-structures/.extracted/P12345_AF3_1.cif"

# Get multiple structures
structures <- get_structures(ds, c("P12345_AF3_1", "P67890_BZ2_1"))

6. Get Confidence Data

pLDDT Scores

# Per-residue pLDDT
plddt <- get_plddt(ds, "P12345_AF3_1")
#>   residue plddt
#> 1       1  78.2
#> 2       2  82.5
#> 3       3  91.3
#> ...

# Plot pLDDT
plot(plddt$residue, plddt$plddt, type = "l",
     xlab = "Residue", ylab = "pLDDT")

PAE Matrices

# Predicted Aligned Error matrix
pae <- get_pae(ds, "P12345_AF3_1")

# As a matrix (default)
dim(pae)
#> [1] 234 234

# Visualize
image(pae, col = hcl.colors(50, "RdBu"))

7. Cross-Dataset Queries

Work with multiple datasets:

# Query across all installed datasets
all_kinases <- search_structures(
  pattern = "kinase",
  datasets = installed_datasets()$name
)

# Find a protein across datasets
find_protein("P12345")

# Compare datasets
compare_datasets(c("arabidopsis-structures", "pathogen-effectors"))

8. Cache Management

# Check cache size
cache_size()
#> Total: 2.3 GB

# Clear extracted files (keeps downloaded data)
clear_extracted()

# Remove a dataset entirely
remove_dataset("old-dataset")

# Clear everything
clear_cache()

Complete Example

library(tslstructures)
library(dplyr)

# Find datasets with AlphaFold3 predictions
datasets <- list_datasets(full = TRUE) |>
  filter(has_predictor(predictors, "alphafold3"))

# Install the first one
install_dataset(datasets$record_id[1])

# Load and explore
ds <- load_dataset(datasets$name[1])

# Find high-confidence AlphaFold3 structures (first 10)
targets <- ds |>
  filter(
    plddt_mean > 90,
    ptm > 0.85,
    source == "alphafold3"
  ) |>
  head(10)

# Extract structures
for (id in targets$structure_id) {
  path <- get_structure(ds, id)
  message("Extracted: ", path)
}

Next Steps