This vignette shows how to work with multiple TSP datasets simultaneously - querying, searching, and comparing across datasets.
Installing Multiple Datasets
# Install datasets (example IDs)
install_dataset(415123, sandbox = TRUE)
install_dataset(415124, sandbox = TRUE) # hypothetical second dataset
# See what's installed
installed_datasets()
#> # A tibble: 2 x 5
#> record_id title doi installed_at path
#> <chr> <chr> <chr> <chr> <chr>
#> 1 415123 TSL Test Dataset 10.5072/zenodo.415123 2024-12-16 /...
#> 2 415124 Another Dataset 10.5072/zenodo.415124 2024-12-16 /...
Querying Across Datasets
Basic Query
query_structures() combines data from multiple
datasets:
# Query all installed datasets
all_data <- query_structures()
# Each row has a dataset_id column
all_data |>
count(dataset_id)
#> # A tibble: 2 x 2
#> dataset_id n
#> <chr> <int>
#> 1 415123 62
#> 2 415124 150
# Filter across all datasets
high_quality <- all_data |>
filter(mean_plddt > 85)
Query Specific Datasets
# Query only certain datasets
subset <- query_structures(c(415123, 415124))
# Or just one
single <- query_structures(415123)
Control What’s Loaded
# Metadata only (faster, smaller)
meta_only <- query_structures(include = "metadata")
# Predictions only
preds_only <- query_structures(include = "predictions")
# Both joined (default)
both <- query_structures(include = "both")
Handling Different Schemas
Datasets may have different columns. tslstructures
handles this automatically:
# See which columns exist across datasets
list_columns()
#> # A tibble: 15 x 3
#> column datasets in_all
#> <chr> <int> <lgl>
#> 1 id 2 TRUE
#> 2 structure_id 2 TRUE
#> 3 filename 2 TRUE
#> 4 mean_plddt 2 TRUE
#> 5 special_column 1 FALSE
#> ...
# Columns missing in some datasets are filled with NA
query_structures() |>
select(id, dataset_id, special_column) |>
head()
#> # A tibble: 6 x 3
#> id dataset_id special_column
#> <chr> <chr> <chr>
#> 1 P00571_AF3_1 415123 NA
#> 2 P00571_AF3_2 415123 NA
#> 3 SOME_ID 415124 "has value"
Comparing Datasets
Get summary statistics across datasets:
compare_datasets()
#> # A tibble: 2 x 5
#> dataset_id n_structures n_proteins predictors mean_plddt
#> <chr> <int> <int> <chr> <dbl>
#> 1 415123 62 7 alphafold3, boltz2 82.5
#> 2 415124 150 50 alphafold2, alphafold3 78.3
Searching Across Datasets
Text Pattern Search
# Search for structures matching a pattern
search_structures("kinase")
#> v Found 23 match(es) in 2 dataset(s)
#> # A tibble: 23 x 15
#> id structure_id ... dataset_id match_field
#> <chr> <chr> <chr> <chr>
#> 1 KINASE1_AF3_1 KINASE1 ... 415123 id
#> 2 KINASE1_AF3_2 KINASE1 ... 415123 id
#> ...
# Search specific fields
search_structures("P00571", fields = c("id", "structure_id"))
# Case-sensitive search
search_structures("AT1G", ignore_case = FALSE)
# Regex patterns
search_structures("^AT[0-9]G")
Find by Protein ID
# Find all structures for a protein
find_protein("P00571")
#> # A tibble: 12 x 14
#> id structure_id prediction_source model_rank mean_plddt dataset_id
#> <chr> <chr> <chr> <int> <dbl> <chr>
#> 1 P00571_AF3_1 P00571 alphafold3 1 89.2 415123
#> 2 P00571_AF3_2 P00571 alphafold3 2 87.8 415123
#> 3 P00571_BZ2_1 P00571 boltz2 1 85.4 415123
#> ...
# Find multiple proteins
find_protein(c("P00571", "Q9LPW0"))
# Substring matching
find_protein("P005", exact = FALSE)
Find Best Structures
Get the highest-quality structure for each protein:
# Best structure per protein (by pLDDT)
best <- find_best()
best
#> # A tibble: 57 x 14
#> id structure_id mean_plddt dataset_id ...
#> <chr> <chr> <dbl> <chr>
#> 1 P00571_AF3_1 P00571 89.2 415123
#> 2 Q9LPW0_AF3_1 Q9LPW0 91.5 415123
#> ...
# Only high-confidence structures
best_high <- find_best(min_plddt = 85)
# From specific datasets
best_subset <- find_best(record_ids = 415123)
Common Workflows
Compare Predictors
query_structures() |>
group_by(prediction_source) |>
summarise(
n = n(),
mean_plddt = mean(mean_plddt, na.rm = TRUE),
mean_ptm = mean(ptm_score, na.rm = TRUE),
.groups = "drop"
) |>
arrange(desc(mean_plddt))
#> # A tibble: 3 x 4
#> prediction_source n mean_plddt mean_ptm
#> <chr> <int> <dbl> <dbl>
#> 1 alphafold3 80 85.2 0.82
#> 2 boltz2 70 82.1 0.79
#> 3 alphafold2 62 78.5 NA
Find Structures Across Predictors
# Proteins with structures from multiple predictors
query_structures() |>
group_by(structure_id) |>
summarise(
predictors = paste(unique(prediction_source), collapse = ", "),
n_predictors = n_distinct(prediction_source),
best_plddt = max(mean_plddt, na.rm = TRUE),
.groups = "drop"
) |>
filter(n_predictors > 1) |>
arrange(desc(n_predictors))
Export Best Structures
# Get paths to best structure files
best <- find_best(min_plddt = 80)
# Extract structure files
paths <- lapply(seq_len(nrow(best)), function(i) {
get_structure(best$id[i], best$dataset_id[i])
})
# Copy to output directory
output_dir <- "best_structures/"
dir.create(output_dir, showWarnings = FALSE)
for (i in seq_along(paths)) {
file.copy(paths[[i]], file.path(output_dir, basename(paths[[i]])))
}
Summary
| Function | Purpose |
|---|---|
| `query_structures()` | Combine data from multiple datasets |
| `list_columns()` | See column availability |
| `compare_datasets()` | Summary statistics |
| `search_structures()` | Regex pattern search |
| `find_protein()` | Find by protein ID |
| `find_best()` | Best structure per protein |
Key points:
- All functions add a dataset_id column to identify the source dataset
- Missing columns are filled with NA when combining datasets
- Regex patterns are supported in search_structures()
- find_best() selects by pLDDT score