Cross-Dataset Workflows • tslstructures

This vignette shows how to work with multiple TSP datasets simultaneously: querying, searching, and comparing across datasets.

library(tslstructures)
library(dplyr)

Installing Multiple Datasets

# Install datasets by record ID (example IDs)
install_dataset(415123, sandbox = TRUE)
install_dataset(415124, sandbox = TRUE)  # hypothetical second dataset

# See what's installed: one row per dataset, identified by record_id
installed_datasets()
#> # A tibble: 2 x 5
#>   record_id title                 doi                   installed_at path
#>   <chr>     <chr>                 <chr>                 <chr>        <chr>
#> 1 415123    TSL Test Dataset      10.5072/zenodo.415123 2024-12-16   /...
#> 2 415124    Another Dataset       10.5072/zenodo.415124 2024-12-16   /...

Querying Across Datasets

Basic Query

query_structures() combines data from multiple datasets:

# Load every installed dataset into one combined table
all_data <- query_structures()

# The dataset_id column records which dataset each row came from
count(all_data, dataset_id)
#> # A tibble: 2 x 2
#>   dataset_id     n
#>   <chr>      <int>
#> 1 415123        62
#> 2 415124       150

# A filter on the combined table applies to all datasets at once
high_quality <- filter(all_data, mean_plddt > 85)

Query Specific Datasets

# Query only certain datasets (pass a vector of record IDs).
# The result is named `selected` rather than `subset` so the example
# does not shadow base::subset().
selected <- query_structures(c(415123, 415124))

# Or just one
single <- query_structures(415123)

Control What’s Loaded

# The `include` argument selects which tables are loaded.

# Metadata only (faster, smaller)
meta_only <- query_structures(include = "metadata")

# Predictions only
preds_only <- query_structures(include = "predictions")

# Both joined (default)
both <- query_structures(include = "both")

Handling Different Schemas

Datasets may have different columns. tslstructures handles this automatically:

# See which columns exist across datasets
# (`datasets` counts the datasets containing the column;
#  `in_all` is TRUE when every dataset has it)
list_columns()
#> # A tibble: 15 x 3
#>    column            datasets in_all
#>    <chr>                <int> <lgl>
#>  1 id                       2 TRUE
#>  2 structure_id             2 TRUE
#>  3 filename                 2 TRUE
#>  4 mean_plddt               2 TRUE
#>  5 special_column           1 FALSE
#>  ...

# Columns missing in some datasets are filled with NA
# (output below is abridged to show rows from both datasets)
query_structures() |>
  select(id, dataset_id, special_column) |>
  head()
#> # A tibble: 6 x 3
#>   id           dataset_id special_column
#>   <chr>        <chr>      <chr>
#> 1 P00571_AF3_1 415123     NA
#> 2 P00571_AF3_2 415123     NA
#> 3 SOME_ID      415124     "has value"

Comparing Datasets

Get summary statistics across datasets:

# Returns one summary row per installed dataset
compare_datasets()
#> # A tibble: 2 x 5
#>   dataset_id n_structures n_proteins predictors              mean_plddt
#>   <chr>             <int>      <int> <chr>                        <dbl>
#> 1 415123               62          7 alphafold3, boltz2            82.5
#> 2 415124              150         50 alphafold2, alphafold3        78.3

Searching Across Datasets

Text Pattern Search

# Search for structures matching a pattern
# (matching appears to be case-insensitive by default: "kinase" finds
#  KINASE1 below; match_field reports which field matched)
search_structures("kinase")
#> v Found 23 match(es) in 2 dataset(s)
#> # A tibble: 23 x 15
#>   id              structure_id  ... dataset_id match_field
#>   <chr>           <chr>             <chr>      <chr>
#> 1 KINASE1_AF3_1   KINASE1       ... 415123     id
#> 2 KINASE1_AF3_2   KINASE1       ... 415123     id
#> ...

# Search specific fields
search_structures("P00571", fields = c("id", "structure_id"))

# Case-sensitive search
search_structures("AT1G", ignore_case = FALSE)

# Regex patterns
search_structures("^AT[0-9]G")

Find by Protein ID

# Find all structures for a protein
# (the output shows one row per predicted model, across predictors)
find_protein("P00571")
#> # A tibble: 12 x 14
#>   id           structure_id prediction_source model_rank mean_plddt dataset_id
#>   <chr>        <chr>        <chr>                  <int>      <dbl> <chr>
#> 1 P00571_AF3_1 P00571       alphafold3                 1       89.2 415123
#> 2 P00571_AF3_2 P00571       alphafold3                 2       87.8 415123
#> 3 P00571_BZ2_1 P00571       boltz2                     1       85.4 415123
#> ...

# Find multiple proteins
find_protein(c("P00571", "Q9LPW0"))

# Substring matching
find_protein("P005", exact = FALSE)

Find Best Structures

Get the highest-quality structure for each protein:

# Best structure per protein (by pLDDT)
best <- find_best()
best
#> # A tibble: 57 x 14
#>   id           structure_id mean_plddt dataset_id ...
#>   <chr>        <chr>             <dbl> <chr>
#> 1 P00571_AF3_1 P00571             89.2 415123
#> 2 Q9LPW0_AF3_1 Q9LPW0             91.5 415123
#> ...

# Only high-confidence structures
best_high <- find_best(min_plddt = 85)

# From specific datasets (restrict the search by record ID)
best_subset <- find_best(record_ids = 415123)

Common Workflows

Compare Predictors

# Summarise confidence scores per predictor, best predictor first.
# Where a predictor has no ptm_score values at all, mean() with
# na.rm = TRUE averages an empty set and returns NaN.
query_structures() |>
  group_by(prediction_source) |>
  summarise(
    n = n(),
    mean_plddt = mean(mean_plddt, na.rm = TRUE),
    mean_ptm = mean(ptm_score, na.rm = TRUE),
    .groups = "drop"
  ) |>
  arrange(desc(mean_plddt))
#> # A tibble: 3 x 4
#>   prediction_source     n mean_plddt mean_ptm
#>   <chr>             <int>      <dbl>    <dbl>
#> 1 alphafold3           80       85.2    0.82
#> 2 boltz2               70       82.1    0.79
#> 3 alphafold2           62       78.5   NaN

Find Structures Across Predictors

# Proteins with structures from multiple predictors
query_structures() |>
  group_by(structure_id) |>
  summarise(
    predictors = paste(unique(prediction_source), collapse = ", "),
    n_predictors = n_distinct(prediction_source),
    best_plddt = max(mean_plddt, na.rm = TRUE),
    .groups = "drop"
  ) |>
  filter(n_predictors > 1) |>
  arrange(desc(n_predictors))

Export Best Structures

# Select the best structure per protein, keeping only confident models
best <- find_best(min_plddt = 80)

# Extract structure files: each id is paired with the dataset it came from
paths <- Map(get_structure, best$id, best$dataset_id)

# Copy to output directory
# (no trailing slash: it is redundant with file.path() and trailing
#  separators in directory names are not portable to Windows)
output_dir <- "best_structures"
dir.create(output_dir, showWarnings = FALSE)

# file.copy() is vectorised; when `to` is an existing directory each file
# keeps its basename. It returns FALSE (rather than erroring) for files it
# could not copy, including files that already exist in the destination.
copied <- file.copy(unlist(paths, use.names = FALSE), output_dir)
if (!all(copied)) {
  warning(
    sum(!copied), " structure file(s) were not copied ",
    "(already present in '", output_dir, "' or unreadable)",
    call. = FALSE
  )
}

Summary

Function	Purpose
`query_structures()`	Combine data from multiple datasets
`list_columns()`	See column availability
`compare_datasets()`	Summary statistics
`search_structures()`	Regex pattern search
`find_protein()`	Find by protein ID
`find_best()`	Best structure per protein

Key points:

- Functions that return structures add a dataset_id column to identify the source dataset
- Missing columns are filled with NA when combining
- Regex patterns are supported in search_structures()
- find_best() selects by pLDDT score