Get from zero to querying protein structure data in 5 minutes.

Install the package

# install.packages("remotes")
remotes::install_github("TeamMacLean/tslstructures")
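
Then load the package. The examples below also use dplyr verbs (filter(), select()) for querying:

library(tslstructures)
library(dplyr)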

Discover available datasets

See what datasets are available in the TSL Structures community:

# List all available datasets
list_datasets(sandbox = TRUE)
#> # A tibble: 1 x 6
#>   record_id title                  doi                   version created   description
#>   <chr>     <chr>                  <chr>                 <chr>   <chr>     <chr>
#> 1 415123    TSL Test Structure Da… 10.5072/zenodo.415123 1.0.0   2024-12-… Test dataset...

# Search datasets by text
list_datasets(query = "arabidopsis", sandbox = TRUE)

Filter by dataset contents

Use full = TRUE to get detailed metadata for filtering:

# Get full metadata (fetches datapackage.json from each dataset)
datasets <- list_datasets(full = TRUE, sandbox = TRUE)

# Filter by predictor
datasets |>
  filter(has_predictor(predictors, "alphafold3"))

# Filter by size and structure count
datasets |>
  filter(is_small(total_size_bytes),
         has_min_structures(structure_count, 50))

# Multiple conditions
datasets |>
  filter(has_predictor(predictors, c("alphafold3", "boltz2")),
         has_format(formats, "cif"))

Get detailed information about a specific dataset:

info <- dataset_info(415123, sandbox = TRUE)

info$title
#> [1] "TSL Test Structure Dataset"

info$files
#> # A tibble: 6 x 3
#>   filename                    size checksum
#>   <chr>                      <dbl> <chr>
#> 1 datapackage.json            2048 md5:...
#> 2 metadata.parquet           12288 md5:...
#> 3 predictions/scores.parquet  8192 md5:...
#> ...
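
Because every listing carries a record_id column, you can chain discovery straight into inspection. A minimal sketch, reusing the filtered tibble from above:

# Look up the first dataset that matches a filter
ids <- datasets |>
  filter(has_predictor(predictors, "alphafold3")) |>
  pull(record_id)

info <- dataset_info(ids[1], sandbox = TRUE)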

Install the dataset

Download to your local cache:

install_dataset(415123, sandbox = TRUE)
#> i Fetching dataset information from Zenodo...
#> v Found: TSL Test Structure Dataset
#> i Downloading 6 file(s)...
#> v Dataset installed

Check what you have installed:

installed_datasets()
#> # A tibble: 1 x 5
#>   record_id title                     doi                   installed_at
#>   <chr>     <chr>                     <chr>                 <chr>
#> 1 415123    TSL Test Structure Datas~ 10.5072/zenodo.415123 2024-12-16~
#> # i 1 more variable: path <chr>

Load and query

Load the combined metadata and prediction scores:

data <- load_dataset(415123, lazy = FALSE)

data
#> # A tibble: 62 x 15
#>    structure_id   protein_id predictor    n_chains n_residues plddt_mean ptm
#>    <chr>          <chr>      <chr>           <int>      <int>      <dbl> <dbl>
#>  1 AT1G01010_af2  AT1G01010  alphafold2          1        429       82.3  NA
#>  2 AT1G01010_af3  AT1G01010  alphafold3          1        429       85.1  0.82
#>  ...

Filter with dplyr:

# High-confidence predictions
data |>
  filter(plddt_mean > 85) |>
  select(structure_id, protein_id, predictor, plddt_mean, ptm)

# Compare predictors for a protein
data |>
  filter(protein_id == "AT1G01010") |>
  select(structure_id, predictor, plddt_mean, ptm)
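
Grouped summaries work the same way. For example, to compare predictors across the whole dataset using the plddt_mean column shown above:

# Mean confidence and structure count per predictor
data |>
  group_by(predictor) |>
  summarise(mean_plddt = mean(plddt_mean), n = n())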

Load individual tables

For more control, load metadata and predictions separately:

# Structure metadata only
meta <- load_metadata(415123, lazy = FALSE)

# Prediction scores only
scores <- load_predictions(415123, lazy = FALSE)
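
If you need the combined view back, the two tables can be joined with dplyr. A sketch that assumes structure_id (visible in the combined table above) is the shared key:

# Recombine metadata and scores (assumes structure_id is the join key)
combined <- meta |>
  left_join(scores, by = "structure_id")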

Lazy evaluation for large datasets

For large datasets, use lazy evaluation to filter before loading into memory:

# Returns Arrow Table (not loaded into memory)
data <- load_dataset(415123, lazy = TRUE)

# Filter and collect only matching rows
high_quality <- data |>
  filter(plddt_mean > 90) |>
  collect()
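
Column selection is handled lazily too, so you can project down to just the fields you need before collecting:

# filter() and select() are evaluated by Arrow; only the result enters R
af3 <- data |>
  filter(predictor == "alphafold3") |>
  select(structure_id, plddt_mean, ptm) |>
  collect()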

Clean up

# Remove a dataset
remove_dataset(415123)

# Check cache size
cache_size()

# Clear entire cache
clear_cache()

What’s next

For data producers

Want to create your own TSP dataset? See the Creating TSP Datasets guide.