Access via API and R

An application programming interface (API) is a way for two or more computer programs to communicate with each other. More information about how to amend API links can be found here. Useful introductions to using APIs in R can be found here.

There are three tables the user can pull from the API. Learn more about them on the FOSS data description page. Here, you can see them in their raw JSON format:

Here are some examples of how to use the data with R:

12.1 Ex. Load all rows of the catch, haul, and species data tables

Note that without specifying, a basic query to the API will only return 25 entries.

12.1.1 Load haul data

# link to the API
api_link_haul <- 'https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_haul/'

12.1.1.1 Load first 25 rows of data

res <- httr::GET(url = api_link_haul)
# res ## Test connection

## convert from JSON format
dat <- jsonlite::fromJSON(base::rawToChar(res$content))$items

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

12.1.1.2 Load all data:

Since the maxim number of rows a user can pull is 10,000 rows in a query, the user needs to cycle through by offsetting to the next 10,000 rows (as is shown here).

dat <- data.frame()
for (i in seq(0, 500000, 10000)){
  ## find how many iterations it takes to cycle through the data
  print(i)
  ## query the API link
  res <- httr::GET(url = paste0(api_link_haul, "?offset=",i,"&limit=10000"))
  ## convert from JSON format
  data <- jsonlite::fromJSON(base::rawToChar(res$content)) 
  
  ## if there are no data, stop the loop
  if (is.null(nrow(data$items))) {
    break
  }
  
  ## bind sub-pull to dat data.frame
  dat <- dplyr::bind_rows(dat, 
                          data$items |>
                            dplyr::select(-links)) # necessary for API accounting, but not part of the dataset)
}

Explore the data contents:

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

# learn about the structure of the data
summary(dat)

# Print the first few lines of the data 
dat |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("year", "cruise", "cruisejoin"), 
      big.mark = "") |> 
  flextable::theme_zebra()

# save outputs for later comparison
dat_haul_api <- dat

12.1.2 Load catch data

# link to the API
api_link_catch <- 'https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_catch/'

12.1.2.1 Load first 25 rows of data

res <- httr::GET(url = api_link_catch)
# res ## Test connection

## convert from JSON format
dat <- jsonlite::fromJSON(base::rawToChar(res$content))$items

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

12.1.2.2 Load all data

Since the maxim number of rows a user can pull is 10,000 rows in a query, the user needs to cycle through by offsetting to the next 10,000 rows (as is shown here).

dat <- data.frame()
# for (i in seq(0, 100000, 10000)){
for (i in seq(0, 1000000, 10000)){
  ## find how many iterations it takes to cycle through the data
  # print(i)
  ## query the API link
  res <- httr::GET(url = paste0(api_link_catch, "?offset=",i,"&limit=10000"))
  ## convert from JSON format
  data <- jsonlite::fromJSON(base::rawToChar(res$content)) 
  
  ## if there are no data, stop the loop
  if (is.null(nrow(data$items))) {
    break
  }
  
  ## bind sub-pull to dat data.frame
  dat <- dplyr::bind_rows(dat, 
                          data$items |>
                            dplyr::select(-links)) # necessary for API accounting, but not part of the dataset)
}

Explore the data contents:

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

# learn about the structure of the data
summary(dat)

# Print the first few lines of the data 
dat |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("species_code"), 
      big.mark = "") |> 
  flextable::theme_zebra()

# save outputs for later comparison
dat_catch_api <- dat

12.1.3 Load species data

Since there are less than 10,000 rows of species data (and the maxim number of rows a user can pull from this API is 10,000 rows in a query), we can simply call ?offset=0&limit=10000 in our query call.

# link to the API
api_link_species <- 'https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_species/'
res <- httr::GET(url = paste0(api_link_species, "?offset=0&limit=10000"))

## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content))
dat <- data$items  |>
  dplyr::select(-links) # necessary for API accounting, but not part of the dataset

Explore the data contents:

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

# learn about the structure of the data
summary(dat)

# Print the first few lines of the data 
dat |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("species_code", "worms", "itis"), # 
      big.mark = "") |> 
  flextable::theme_zebra()

# save outputs for later comparison
dat_species_api <- dat

12.2 Ex. Create zero-filled data using data loaded in last example

It is important to create and have access to zero-fill (presence and absence) so you can do simple analyses and plot data.

First prepare a table with all combinations of what species should be listed for what hauls/surveys. For zero-filled data, all species caught in a survey need to have zero or non-zero row entries for a haul

comb <- dplyr::full_join(
  # find all species that have been caught, by survey
  x = dplyr::left_join(dat_catch_api, dat_haul_api, by = "hauljoin") |>
    dplyr::select(survey_definition_id, species_code) |>
    dplyr::distinct(),
  # find all haul events (hauljoins), by survey
  y = dat_haul_api |>
    dplyr::select(survey_definition_id, hauljoin) |>
    dplyr::distinct(),
  relationship = "many-to-many",
  by = "survey_definition_id"
) |> 
  dplyr::select(-survey_definition_id) # now, redundant

Explore the data contents:

print(paste0("rows: ", nrow(comb), "; cols: ", ncol(comb)))

comb |> head(3) |> 
  flextable::flextable()  |>
  flextable::colformat_num(
      j = c("species_code", "hauljoin"), 
      big.mark = "") |> 
  flextable::theme_zebra()

Now, using that table of combinations (here, called comb), join data to make a full zero-filled CPUE dataset. When all of the data have been full joined together, there should be the maximum number of rows in comb.

dat <- comb |>
  # add species data
  dplyr::left_join(dat_species_api) |> # , "species_code"
  # add haul data
  dplyr::left_join(dat_haul_api) |> # , c("hauljoin")
  # add catch data
  dplyr::left_join(dat_catch_api) |> # , c("species_code", "hauljoin")
  # modify/clean up zero-filled rows
  dplyr::mutate(
    cpue_kgkm2 = ifelse(is.na(cpue_kgkm2), 0, cpue_kgkm2),
    cpue_nokm2 = ifelse(is.na(cpue_nokm2), 0, cpue_nokm2),
    count = ifelse(is.na(count), 0, count),
    weight_kg = ifelse(is.na(weight_kg), 0, weight_kg))

Explore the data contents:

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

# learn about the structure of the data
summary(dat)

# Print the first few lines of the data 
dat |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("species_code", "hauljoin", "year", "cruise", "cruisejoin", "worms", "itis"), #
      big.mark = "") |> 
  flextable::theme_zebra()

# save outputs for later comparison
dat_zerofill_api <- dat

12.3 Ex. Visualize zero-filled data for 2023 eastern Bering Sea walleye pollock in CPUE data in distribution map

Using the zero-filled data from the previous example, we can make a few plots!

Here is some example data of 2023 through 2019 (year %in% 2019:2023) eastern and northern Bering Sea (srvy %in% c("EBS", "NBS)) walleye pollock (species_code == 21740).

dat <- dat_zerofill_api |> 
  dplyr::filter(year %in% 2019:2023 & 
                  srvy %in% c("EBS", "NBS") & 
                  species_code == 21740) |> 
  dplyr::select(year, common_name, longitude_dd_start, latitude_dd_start, cpue_kgkm2)

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

# # learn about the structure of the data
# summary(dat)

# Print the first few lines of the data 
dat |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("year"), 
      big.mark = "") |> 
  flextable::theme_zebra()

12.3.1 Plot locations on map

library(ggplot2)

ggplot2::ggplot(data = dat |> dplyr::filter(cpue_kgkm2 != 0), 
                mapping = aes(x = longitude_dd_start, 
                              y = latitude_dd_start, 
                              size = cpue_kgkm2)) + 
  ggplot2::geom_point(alpha = .75) +
  ggplot2::geom_point(data = dat |> dplyr::filter(cpue_kgkm2 == 0), 
                      color = "red", 
                      shape = 17,
                      alpha = .75,
                      size = 3) +
  ggplot2::xlab("Longitude *W") +
  ggplot2::ylab("Latitude *N") +
  ggplot2::ggtitle(label = "CPUE (kg/km^2) of walleye pollock (Weight CPUE; kg/km2)", 
                   subtitle = "Eastern Bering Sea bottom trawl survey") +
  ggplot2::scale_size_continuous(name = "Weight (kg)") + 
  ggplot2::facet_wrap(facets = vars(year)) + 
  ggplot2::theme_bw()

12.3.2 Plot inverse-distance weighted plot of CPUE

This map is constructed using akgfmaps. To make IDW plots, you must have data from all stations surveyed, even if no fish of interest were found there.

These plots are similar to those published in the annual Bering Sea data reports.

# pak::pak("afsc-gap-products/akgfmaps")
library(akgfmaps)
idw <- akgfmaps::make_idw_stack(
  x = dat |> 
    dplyr::select(COMMON_NAME = common_name, 
                  CPUE_KGHA = cpue_kgkm2, 
                  LATITUDE = latitude_dd_start, 
                  LONGITUDE = longitude_dd_start, 
                  year), 
  grouping.vars = "year", 
  region = "bs.all", # Predefined EBS area
  set.breaks = "jenks", # Gets Jenks breaks from classint::classIntervals()
  in.crs = "+proj=longlat", # Set input coordinate reference system
  out.crs = "EPSG:3338", # Set output coordinate reference system
  extrapolation.grid.type = "sf")

shps <- akgfmaps::get_base_layers(
  select.region = "bs.all", 
  # include.corners = TRUE, 
  set.crs = "EPSG:3338")

shps$survey.area$SRVY <- c("EBS", "NBS")
shps$survey.area$SURVEY <- c("EBS", "NBS")

# set.breaks <- akgfmaps::eval_plot_breaks(CPUE = dat$cpue_kgkm2, n.breaks = 5)
# set.breaks <- as.vector(unlist(set.breaks[set.breaks$style == "pretty", -1]))
set.breaks <- c(0, 50000, 100000, 150000, 200000, 250000)

figure_print <- ggplot() +
  # add map of alaska
  ggplot2::geom_sf(data = shps$akland,
                   color = NA,
                   fill = "grey50") +
  # add IDW plots
  geom_sf(data = idw$extrapolation.stack,
          mapping = aes(fill = var1.pred),
          na.rm = FALSE,
          show.legend = TRUE, 
          color = NA) +
  ggplot2::scale_fill_manual(
    name = "walleye pollock\nCPUE (kg/km2)",
    values =  c("gray90",
                viridis::viridis(
                  option = "mako",
                  direction = -1,
                  n = length(set.breaks)-1,
                  begin = 0.20,
                  end = 0.80)),
    na.translate = FALSE, # Don't use NA
    drop = FALSE) + 
  # seperate plots by year
  ggplot2::facet_wrap(facets = vars(year), nrow = 2) + 
  # add survey area
  ggplot2::geom_sf(
    data = shps$survey.area, 
    mapping = aes(color = SURVEY, 
                  geometry = geometry), 
    fill = "transparent", 
    linewidth = 1, 
    show.legend = FALSE) +
  ggplot2::scale_color_manual(
    name = " ", 
    values = c("grey30", "grey50"),
    breaks = shps$survey.area$SURVEY,
    labels = shps$survey.area$SRVY) + 
  # lat/lon axis and map bounds
  ggplot2::scale_x_continuous(name = "Longitude °W",
                              breaks = seq(-180, -150, 5)) +
  ggplot2::scale_y_continuous(name = "Latitude °N",
                              breaks = seq(50, 65, 5)) + # seq(52, 62, 2)
  ggplot2::coord_sf(xlim = sf::st_bbox(shps$survey.area)[c(1,3)],
                    ylim = sf::st_bbox(shps$survey.area)[c(2,4)]) +
  # add theme aesthetics
  ggplot2::guides(
    fill = guide_legend(
      order = 1,
      title.position = "top",
      label.position = "bottom",
      title.hjust = 0.5,
      override.aes = list(color = NA),
      nrow = 1),
    color = "none") +
  ggplot2::theme( 
    panel.background = element_rect(fill = "white", colour = NA), 
    panel.border = element_rect(fill = NA, colour = "grey20"), 
    strip.background = element_blank(), 
    strip.text = element_text(size = 10, face = "bold"), 
    legend.text = element_text(size = 9),
    legend.background = element_rect(colour = "transparent", 
                                     fill = "transparent"),
    legend.key = element_rect(colour = "transparent", 
                              fill = "transparent"),
    legend.position = "bottom", 
    legend.box = "horizontal",
    legend.box.spacing = unit(0, "pt"), # reduce space between legend & plot
    legend.margin=margin(0, 0, 0, 0) )

figure_print

12.4 Ex. Show catch data for 2023 eastern Bering Sea Walleye Pollock (one species in one survey region in one year)

Data downloads and joins for just one species, survey, and year are much faster and easier to do.

First, because year is identified in the haul table, we need to identify all of the hauls (or more specifically, hauljoin codes) that were completed in the eastern Bering Sea ("srvy":"EBS") in 2023 ("year":2023).

Note: Check how many rows and columns are in the data pull. The eastern Bering Sea survey (before 2024) has 376 stations in it, and pollock are often found in throughout the region so this should have a similar number of rows.

## query the API link
res <- httr::GET(url = paste0(api_link_haul, '?limit=10000&q={"year":2023,"srvy":"EBS"}'))

## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content)) 
dat <- data$items |>
  dplyr::select(-links) # necessary for API accounting, but not part of the dataset

## show summary of data to make sure it is subset correctly
summary(dat |> dplyr::mutate(srvy = as.factor(srvy)))

## Find how many rows and columns are in the data pull. 
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

# save outputs for later comparison
dat_haul_ex <- dat
# Print the first few lines of the data 
dat_haul_ex |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("year", "hauljoin", "cruise"), 
      big.mark = "") |> 
  flextable::theme_zebra()

12.4.1 Identify species_code for walleye pollock

In the catch data, we itemize species catches by species_code. To find out which species_code to use, you can check variations on the following code. Note that here the word pollock is case sensitive. All species common_name entries are lower case except for proper nouns (e.g., “Pacific”). The notation for finding a string is to use % around the phrase. Since % is a reserved character in a URL, you have to replace % with %25. Similarly, %20 needs to be used in place of a space (e.g., between “walleye” and “pollock”: "walleye%20pollock"}').

## query the API link. Use: 
res <- httr::GET(url = paste0(api_link_species, '?q={%22common_name%22:%22walleye%20pollock%22}'))
# OR
res <- httr::GET(url = paste0(api_link_species, '?q={"common_name":{"$like":"%25pollock%25"}}'))
# OR
res <- httr::GET(url = paste0(api_link_species, '?q={"common_name":"walleye%20pollock"}'))

## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content)) 

# save outputs for later comparison
dat_species_ex <- data$items |> dplyr::select(-links) # necessary for API accounting, but not part of the dataset
# Print the first few lines of the data
dat_species_ex |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("species_code"), 
      big.mark = "") |> 
  flextable::theme_zebra()

12.4.2 Then, apply the hauljoins and species_code to catch query

We’ll use the data from the haul and species table we collected before to select 2023 eastern Bering Sea walleye pollock catch data.

## query the API link
# data for all walleye pollock caught in all 2023 eastern Bering Sea survey hauls
dat <- data.frame()
# there must be a better way to select multiple values for one parameter, 
# but saving that, we will loop through each hauljoin and collect the data of interest
for (i in 1:nrow(dat_haul_ex)) {
  res <- httr::GET(url = paste0(
    api_link_catch, 
    '?q={"species_code":21740,"hauljoin":', dat_haul_ex$hauljoin[i],'}'))
  ## convert from JSON format
  data <- jsonlite::fromJSON(base::rawToChar(res$content)) 
  if (length(data$items) != 0) {
    dat <- dplyr::bind_rows(
      dat,
      data$items |> 
        dplyr::select(-links)) # necessary for API accounting, but not part of the dataset
  }
}

Explore data:

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

# learn about the structure of the data
summary(dat)

# Print the first few lines of the data 
dat |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("hauljoin", "species_code"), 
      big.mark = "") |> 
  flextable::theme_zebra()

# save outputs for later comparison
dat_catch_ex <- dat

For reference and to help break down the above query, see these other query examples:

# data for haul -22775 (i.e., one specific haul)?
res <- httr::GET(url = paste0(api_link_catch, 
                              '?offset=',i,'&limit=10000&q={"hauljoin":-22775}'))

# data for all walleye pollock (i.e., one species) caught in all years and surveys
res <- httr::GET(url = paste0(api_link_catch, 
                              '?offset=',i,'&limit=10000&q={"species_code":21740}'))

12.4.3 Create zero-filled data for 2023 eastern Bering Sea walleye pollock and plot

It is important to create and have access to zero-fill (presence and absence) so you can do simple analyses and plot data.

dat <- dplyr::full_join(
  dat_haul_ex,
  dat_catch_ex) |> 
  dplyr::full_join(
    dat_species_ex)  |> 
  # modify zero-filled rows
  dplyr::mutate(
    cpue_kgkm2 = ifelse(is.na(cpue_kgkm2), 0, cpue_kgkm2),
    cpue_nokm2 = ifelse(is.na(cpue_nokm2), 0, cpue_nokm2),
    count = ifelse(is.na(count), 0, count),
    weight_kg = ifelse(is.na(weight_kg), 0, weight_kg))

Explore data

# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))

# learn about the structure of the data
summary(dat)

# Print the first few lines of the data 
dat |> 
  head(3) |> 
  flextable::flextable() |>
  flextable::colformat_num(
      j = c("year", "cruise", "cruisejoin", "species_code"), 
      big.mark = "") |> 
  flextable::theme_zebra()

12.4.4 Visualize CPUE data in distribution map

Using the zero-filled data from the previous example, we can make a few plots!

12.5 Plot locations

library(ggplot2)

ggplot2::ggplot(data = dat |> dplyr::filter(cpue_kgkm2 != 0), 
                mapping = aes(x = longitude_dd_start, 
                              y = latitude_dd_start, 
                              size = cpue_kgkm2)) + 
  ggplot2::geom_point(alpha = .75) +
  ggplot2::geom_point(data = dat |> dplyr::filter(cpue_kgkm2 == 0), 
                      color = "red", 
                      shape = 17,
                      alpha = .75,
                      size = 3) +
  ggplot2::xlab("Longitude *W") +
  ggplot2::ylab("Latitude *N") +
  ggplot2::ggtitle(label = "Catches of walleye pollock (Weight CPUE; kg/km2)", 
                   subtitle = "2023 eastern Bering Sea bottom trawl survey") +
  ggplot2::scale_size_continuous(name = "Weight (kg)") + 
  ggplot2::theme_bw()

12.5.1 Plot inverse-distance weighted modeled product of locations

This map is constructed using akgfmaps

# pak::pak("afsc-gap-products/akgfmaps")
library(akgfmaps)

figure0 <- akgfmaps::make_idw_map(
  CPUE_KGHA = dat$cpue_kgkm2, # calculates the same, regardless of units.  
  LATITUDE = dat$latitude_dd_start, 
  LONGITUDE = dat$longitude_dd_start, 
  region = "bs.south", # Predefined EBS area
  set.breaks = "jenks", # Gets Jenks breaks from classint::classIntervals()
  in.crs = "+proj=longlat", # Set input coordinate reference system
  out.crs = "EPSG:3338", # Set output coordinate reference system
  extrapolation.grid.type = "sf")

figure0$plot + # 20x20km grid
  ggplot2::guides(fill=guide_legend(title = "walleye pollock\nCPUE (kg/km2)"))