# link to the API
api_link_haul <- 'https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_haul/'Access via API and R
An application programming interface (API) is a way for two or more computer programs to communicate with each other. More information about how to amend API links can be found here. Useful introductions to using APIs in R can be found here.
There are three tables the user can pull from the API. Learn more about them on the FOSS data description page. Here, you can see them in their raw JSON format:
- haul: https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_haul/
- catch: https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_catch/
- species: https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_species/
Here are some examples of how to use the data with R:
12.1 Ex. Load all rows of the catch, haul, and species data tables
Note that without specifying, a basic query to the API will only return 25 entries.
12.1.1 Load haul data
12.1.1.1 Load first 25 rows of data
res <- httr::GET(url = api_link_haul)
# res ## Test connection
## convert from JSON format
dat <- jsonlite::fromJSON(base::rawToChar(res$content))$items
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))12.1.1.2 Load all data:
Since the maxim number of rows a user can pull is 10,000 rows in a query, the user needs to cycle through by offsetting to the next 10,000 rows (as is shown here).
dat <- data.frame()
for (i in seq(0, 500000, 10000)){
## find how many iterations it takes to cycle through the data
print(i)
## query the API link
res <- httr::GET(url = paste0(api_link_haul, "?offset=",i,"&limit=10000"))
## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content))
## if there are no data, stop the loop
if (is.null(nrow(data$items))) {
break
}
## bind sub-pull to dat data.frame
dat <- dplyr::bind_rows(dat,
data$items |>
dplyr::select(-links)) # necessary for API accounting, but not part of the dataset)
}Explore the data contents:
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))
# learn about the structure of the data
summary(dat)
# Print the first few lines of the data
dat |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("year", "cruise", "cruisejoin"),
big.mark = "") |>
flextable::theme_zebra()
# save outputs for later comparison
dat_haul_api <- dat12.1.2 Load catch data
# link to the API
api_link_catch <- 'https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_catch/'12.1.2.1 Load first 25 rows of data
res <- httr::GET(url = api_link_catch)
# res ## Test connection
## convert from JSON format
dat <- jsonlite::fromJSON(base::rawToChar(res$content))$items
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))12.1.2.2 Load all data
Since the maxim number of rows a user can pull is 10,000 rows in a query, the user needs to cycle through by offsetting to the next 10,000 rows (as is shown here).
dat <- data.frame()
# for (i in seq(0, 100000, 10000)){
for (i in seq(0, 1000000, 10000)){
## find how many iterations it takes to cycle through the data
# print(i)
## query the API link
res <- httr::GET(url = paste0(api_link_catch, "?offset=",i,"&limit=10000"))
## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content))
## if there are no data, stop the loop
if (is.null(nrow(data$items))) {
break
}
## bind sub-pull to dat data.frame
dat <- dplyr::bind_rows(dat,
data$items |>
dplyr::select(-links)) # necessary for API accounting, but not part of the dataset)
}Explore the data contents:
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))
# learn about the structure of the data
summary(dat)
# Print the first few lines of the data
dat |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("species_code"),
big.mark = "") |>
flextable::theme_zebra()
# save outputs for later comparison
dat_catch_api <- dat12.1.3 Load species data
Since there are less than 10,000 rows of species data (and the maxim number of rows a user can pull from this API is 10,000 rows in a query), we can simply call ?offset=0&limit=10000 in our query call.
# link to the API
api_link_species <- 'https://apps-st.fisheries.noaa.gov/ods/foss/afsc_groundfish_survey_species/'res <- httr::GET(url = paste0(api_link_species, "?offset=0&limit=10000"))
## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content))
dat <- data$items |>
dplyr::select(-links) # necessary for API accounting, but not part of the datasetExplore the data contents:
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))
# learn about the structure of the data
summary(dat)
# Print the first few lines of the data
dat |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("species_code", "worms", "itis"), #
big.mark = "") |>
flextable::theme_zebra()
# save outputs for later comparison
dat_species_api <- dat12.2 Ex. Create zero-filled data using data loaded in last example
It is important to create and have access to zero-fill (presence and absence) so you can do simple analyses and plot data.
First prepare a table with all combinations of what species should be listed for what hauls/surveys. For zero-filled data, all species caught in a survey need to have zero or non-zero row entries for a haul
comb <- dplyr::full_join(
# find all species that have been caught, by survey
x = dplyr::left_join(dat_catch_api, dat_haul_api, by = "hauljoin") |>
dplyr::select(survey_definition_id, species_code) |>
dplyr::distinct(),
# find all haul events (hauljoins), by survey
y = dat_haul_api |>
dplyr::select(survey_definition_id, hauljoin) |>
dplyr::distinct(),
relationship = "many-to-many",
by = "survey_definition_id"
) |>
dplyr::select(-survey_definition_id) # now, redundantExplore the data contents:
print(paste0("rows: ", nrow(comb), "; cols: ", ncol(comb)))
comb |> head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("species_code", "hauljoin"),
big.mark = "") |>
flextable::theme_zebra()Now, using that table of combinations (here, called comb), join data to make a full zero-filled CPUE dataset. When all of the data have been full joined together, there should be the maximum number of rows in comb.
dat <- comb |>
# add species data
dplyr::left_join(dat_species_api) |> # , "species_code"
# add haul data
dplyr::left_join(dat_haul_api) |> # , c("hauljoin")
# add catch data
dplyr::left_join(dat_catch_api) |> # , c("species_code", "hauljoin")
# modify/clean up zero-filled rows
dplyr::mutate(
cpue_kgkm2 = ifelse(is.na(cpue_kgkm2), 0, cpue_kgkm2),
cpue_nokm2 = ifelse(is.na(cpue_nokm2), 0, cpue_nokm2),
count = ifelse(is.na(count), 0, count),
weight_kg = ifelse(is.na(weight_kg), 0, weight_kg))Explore the data contents:
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))
# learn about the structure of the data
summary(dat)
# Print the first few lines of the data
dat |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("species_code", "hauljoin", "year", "cruise", "cruisejoin", "worms", "itis"), #
big.mark = "") |>
flextable::theme_zebra()
# save outputs for later comparison
dat_zerofill_api <- dat12.3 Ex. Visualize zero-filled data for 2023 eastern Bering Sea walleye pollock in CPUE data in distribution map
Using the zero-filled data from the previous example, we can make a few plots!
Here is some example data of 2023 through 2019 (year %in% 2019:2023) eastern and northern Bering Sea (srvy %in% c("EBS", "NBS)) walleye pollock (species_code == 21740).
dat <- dat_zerofill_api |>
dplyr::filter(year %in% 2019:2023 &
srvy %in% c("EBS", "NBS") &
species_code == 21740) |>
dplyr::select(year, common_name, longitude_dd_start, latitude_dd_start, cpue_kgkm2)
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))
# # learn about the structure of the data
# summary(dat)
# Print the first few lines of the data
dat |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("year"),
big.mark = "") |>
flextable::theme_zebra()12.3.1 Plot locations on map
library(ggplot2)
ggplot2::ggplot(data = dat |> dplyr::filter(cpue_kgkm2 != 0),
mapping = aes(x = longitude_dd_start,
y = latitude_dd_start,
size = cpue_kgkm2)) +
ggplot2::geom_point(alpha = .75) +
ggplot2::geom_point(data = dat |> dplyr::filter(cpue_kgkm2 == 0),
color = "red",
shape = 17,
alpha = .75,
size = 3) +
ggplot2::xlab("Longitude *W") +
ggplot2::ylab("Latitude *N") +
ggplot2::ggtitle(label = "CPUE (kg/km^2) of walleye pollock (Weight CPUE; kg/km2)",
subtitle = "Eastern Bering Sea bottom trawl survey") +
ggplot2::scale_size_continuous(name = "Weight (kg)") +
ggplot2::facet_wrap(facets = vars(year)) +
ggplot2::theme_bw()12.3.2 Plot inverse-distance weighted plot of CPUE
This map is constructed using akgfmaps. To make IDW plots, you must have data from all stations surveyed, even if no fish of interest were found there.
These plots are similar to those published in the annual Bering Sea data reports.
# pak::pak("afsc-gap-products/akgfmaps")
library(akgfmaps)
idw <- akgfmaps::make_idw_stack(
x = dat |>
dplyr::select(COMMON_NAME = common_name,
CPUE_KGHA = cpue_kgkm2,
LATITUDE = latitude_dd_start,
LONGITUDE = longitude_dd_start,
year),
grouping.vars = "year",
region = "bs.all", # Predefined EBS area
set.breaks = "jenks", # Gets Jenks breaks from classint::classIntervals()
in.crs = "+proj=longlat", # Set input coordinate reference system
out.crs = "EPSG:3338", # Set output coordinate reference system
extrapolation.grid.type = "sf")
shps <- akgfmaps::get_base_layers(
select.region = "bs.all",
# include.corners = TRUE,
set.crs = "EPSG:3338")
shps$survey.area$SRVY <- c("EBS", "NBS")
shps$survey.area$SURVEY <- c("EBS", "NBS")
# set.breaks <- akgfmaps::eval_plot_breaks(CPUE = dat$cpue_kgkm2, n.breaks = 5)
# set.breaks <- as.vector(unlist(set.breaks[set.breaks$style == "pretty", -1]))
set.breaks <- c(0, 50000, 100000, 150000, 200000, 250000)
figure_print <- ggplot() +
# add map of alaska
ggplot2::geom_sf(data = shps$akland,
color = NA,
fill = "grey50") +
# add IDW plots
geom_sf(data = idw$extrapolation.stack,
mapping = aes(fill = var1.pred),
na.rm = FALSE,
show.legend = TRUE,
color = NA) +
ggplot2::scale_fill_manual(
name = "walleye pollock\nCPUE (kg/km2)",
values = c("gray90",
viridis::viridis(
option = "mako",
direction = -1,
n = length(set.breaks)-1,
begin = 0.20,
end = 0.80)),
na.translate = FALSE, # Don't use NA
drop = FALSE) +
# seperate plots by year
ggplot2::facet_wrap(facets = vars(year), nrow = 2) +
# add survey area
ggplot2::geom_sf(
data = shps$survey.area,
mapping = aes(color = SURVEY,
geometry = geometry),
fill = "transparent",
linewidth = 1,
show.legend = FALSE) +
ggplot2::scale_color_manual(
name = " ",
values = c("grey30", "grey50"),
breaks = shps$survey.area$SURVEY,
labels = shps$survey.area$SRVY) +
# lat/lon axis and map bounds
ggplot2::scale_x_continuous(name = "Longitude °W",
breaks = seq(-180, -150, 5)) +
ggplot2::scale_y_continuous(name = "Latitude °N",
breaks = seq(50, 65, 5)) + # seq(52, 62, 2)
ggplot2::coord_sf(xlim = sf::st_bbox(shps$survey.area)[c(1,3)],
ylim = sf::st_bbox(shps$survey.area)[c(2,4)]) +
# add theme aesthetics
ggplot2::guides(
fill = guide_legend(
order = 1,
title.position = "top",
label.position = "bottom",
title.hjust = 0.5,
override.aes = list(color = NA),
nrow = 1),
color = "none") +
ggplot2::theme(
panel.background = element_rect(fill = "white", colour = NA),
panel.border = element_rect(fill = NA, colour = "grey20"),
strip.background = element_blank(),
strip.text = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 9),
legend.background = element_rect(colour = "transparent",
fill = "transparent"),
legend.key = element_rect(colour = "transparent",
fill = "transparent"),
legend.position = "bottom",
legend.box = "horizontal",
legend.box.spacing = unit(0, "pt"), # reduce space between legend & plot
legend.margin=margin(0, 0, 0, 0) )
figure_print12.4 Ex. Show catch data for 2023 eastern Bering Sea Walleye Pollock (one species in one survey region in one year)
Data downloads and joins for just one species, survey, and year are much faster and easier to do.
First, because year is identified in the haul table, we need to identify all of the hauls (or more specifically, hauljoin codes) that were completed in the eastern Bering Sea ("srvy":"EBS") in 2023 ("year":2023).
Note: Check how many rows and columns are in the data pull. The eastern Bering Sea survey (before 2024) has 376 stations in it, and pollock are often found in throughout the region so this should have a similar number of rows.
## query the API link
res <- httr::GET(url = paste0(api_link_haul, '?limit=10000&q={"year":2023,"srvy":"EBS"}'))
## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content))
dat <- data$items |>
dplyr::select(-links) # necessary for API accounting, but not part of the dataset
## show summary of data to make sure it is subset correctly
summary(dat |> dplyr::mutate(srvy = as.factor(srvy)))
## Find how many rows and columns are in the data pull.
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))
# save outputs for later comparison
dat_haul_ex <- dat# Print the first few lines of the data
dat_haul_ex |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("year", "hauljoin", "cruise"),
big.mark = "") |>
flextable::theme_zebra()12.4.1 Identify species_code for walleye pollock
In the catch data, we itemize species catches by species_code. To find out which species_code to use, you can check variations on the following code. Note that here the word pollock is case sensitive. All species common_name entries are lower case except for proper nouns (e.g., “Pacific”). The notation for finding a string is to use % around the phrase. Since % is a reserved character in a URL, you have to replace % with %25. Similarly, %20 needs to be used in place of a space (e.g., between “walleye” and “pollock”: "walleye%20pollock"}').
## query the API link. Use:
res <- httr::GET(url = paste0(api_link_species, '?q={%22common_name%22:%22walleye%20pollock%22}'))
# OR
res <- httr::GET(url = paste0(api_link_species, '?q={"common_name":{"$like":"%25pollock%25"}}'))
# OR
res <- httr::GET(url = paste0(api_link_species, '?q={"common_name":"walleye%20pollock"}'))
## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content))
# save outputs for later comparison
dat_species_ex <- data$items |> dplyr::select(-links) # necessary for API accounting, but not part of the dataset# Print the first few lines of the data
dat_species_ex |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("species_code"),
big.mark = "") |>
flextable::theme_zebra()12.4.2 Then, apply the hauljoins and species_code to catch query
We’ll use the data from the haul and species table we collected before to select 2023 eastern Bering Sea walleye pollock catch data.
## query the API link
# data for all walleye pollock caught in all 2023 eastern Bering Sea survey hauls
dat <- data.frame()
# there must be a better way to select multiple values for one parameter,
# but saving that, we will loop through each hauljoin and collect the data of interest
for (i in 1:nrow(dat_haul_ex)) {
res <- httr::GET(url = paste0(
api_link_catch,
'?q={"species_code":21740,"hauljoin":', dat_haul_ex$hauljoin[i],'}'))
## convert from JSON format
data <- jsonlite::fromJSON(base::rawToChar(res$content))
if (length(data$items) != 0) {
dat <- dplyr::bind_rows(
dat,
data$items |>
dplyr::select(-links)) # necessary for API accounting, but not part of the dataset
}
}Explore data:
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))
# learn about the structure of the data
summary(dat)
# Print the first few lines of the data
dat |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("hauljoin", "species_code"),
big.mark = "") |>
flextable::theme_zebra()
# save outputs for later comparison
dat_catch_ex <- datFor reference and to help break down the above query, see these other query examples:
# data for haul -22775 (i.e., one specific haul)?
res <- httr::GET(url = paste0(api_link_catch,
'?offset=',i,'&limit=10000&q={"hauljoin":-22775}'))
# data for all walleye pollock (i.e., one species) caught in all years and surveys
res <- httr::GET(url = paste0(api_link_catch,
'?offset=',i,'&limit=10000&q={"species_code":21740}'))12.4.3 Create zero-filled data for 2023 eastern Bering Sea walleye pollock and plot
It is important to create and have access to zero-fill (presence and absence) so you can do simple analyses and plot data.
dat <- dplyr::full_join(
dat_haul_ex,
dat_catch_ex) |>
dplyr::full_join(
dat_species_ex) |>
# modify zero-filled rows
dplyr::mutate(
cpue_kgkm2 = ifelse(is.na(cpue_kgkm2), 0, cpue_kgkm2),
cpue_nokm2 = ifelse(is.na(cpue_nokm2), 0, cpue_nokm2),
count = ifelse(is.na(count), 0, count),
weight_kg = ifelse(is.na(weight_kg), 0, weight_kg))Explore data
# Find how many rows and columns are in the data pull
print(paste0("rows: ", nrow(dat), "; cols: ", ncol(dat)))
# learn about the structure of the data
summary(dat)
# Print the first few lines of the data
dat |>
head(3) |>
flextable::flextable() |>
flextable::colformat_num(
j = c("year", "cruise", "cruisejoin", "species_code"),
big.mark = "") |>
flextable::theme_zebra()12.4.4 Visualize CPUE data in distribution map
Using the zero-filled data from the previous example, we can make a few plots!
12.5 Plot locations
library(ggplot2)
ggplot2::ggplot(data = dat |> dplyr::filter(cpue_kgkm2 != 0),
mapping = aes(x = longitude_dd_start,
y = latitude_dd_start,
size = cpue_kgkm2)) +
ggplot2::geom_point(alpha = .75) +
ggplot2::geom_point(data = dat |> dplyr::filter(cpue_kgkm2 == 0),
color = "red",
shape = 17,
alpha = .75,
size = 3) +
ggplot2::xlab("Longitude *W") +
ggplot2::ylab("Latitude *N") +
ggplot2::ggtitle(label = "Catches of walleye pollock (Weight CPUE; kg/km2)",
subtitle = "2023 eastern Bering Sea bottom trawl survey") +
ggplot2::scale_size_continuous(name = "Weight (kg)") +
ggplot2::theme_bw()12.5.1 Plot inverse-distance weighted modeled product of locations
This map is constructed using akgfmaps
# pak::pak("afsc-gap-products/akgfmaps")
library(akgfmaps)
figure0 <- akgfmaps::make_idw_map(
CPUE_KGHA = dat$cpue_kgkm2, # calculates the same, regardless of units.
LATITUDE = dat$latitude_dd_start,
LONGITUDE = dat$longitude_dd_start,
region = "bs.south", # Predefined EBS area
set.breaks = "jenks", # Gets Jenks breaks from classint::classIntervals()
in.crs = "+proj=longlat", # Set input coordinate reference system
out.crs = "EPSG:3338", # Set output coordinate reference system
extrapolation.grid.type = "sf")
figure0$plot + # 20x20km grid
ggplot2::guides(fill=guide_legend(title = "walleye pollock\nCPUE (kg/km2)"))