Before the workshop

Please install the following packages:

#macrosheds
devtools::install_github('https://github.com/MacroSHEDS/macrosheds.git')

#tidyverse (dplyr, stringr, readr, etc.)
install.packages('tidyverse')

Then download the MacroSheds data we will use:

library(macrosheds)

# this is where files will be downloaded. feel free to change it.
project_root <- '~/macrosheds_workshop'

# datasets we will use
ms_download_ws_attr(project_root, dataset = 'summaries')
ms_download_core_data(project_root, domains = c('niwot', 'plum', 'east_river', 'santee'))

These packages are optional (only used here and there, and the overall sequence doesn’t depend on them):

install.packages('xts')         # time series representation
install.packages('dygraphs')    # interactive plots
install.packages('viridis')     # colorblind-friendly color palettes
install.packages('factoextra')  # ordination, etc.
install.packages('mapview')     # interactive maps
install.packages('data.table')  # efficient computation
install.packages('whitebox')    # geospatial processing
whitebox::install_whitebox() #additional whitebox binaries. if this fails, use the next line
# whitebox::install_whitebox(platform = 'linux_musl')

MacroSheds

long-term watershed ecosystem studies, harmonized

Goals

  • enable calculation of input/output solute fluxes from diverse watersheds
  • lower barriers to accessing diverse watershed data through data harmonization and visualization
  • investigate:
    • variation in magnitude, timing, form of N exports
    • broad effects of watershed acidification
    • variation in mineral weathering
    • watershed sensitivity to climate change

Terms

  • site: an individual gauging station or stream sampling location and its watershed
  • domain: one or more sites under common management
  • network: one or more domains under common funding/leadership
  • flux: solute mass normalized to watershed area, over time (kg/ha/d)
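
For concreteness, here is the arithmetic behind that flux unit (hypothetical numbers; ms_calc_flux, used in Part 2, handles this for real data):

conc_mgL   <- 0.5    #solute concentration (mg/L); hypothetical
q_Ls       <- 120    #discharge (L/s); hypothetical
ws_area_ha <- 250    #watershed area (ha); hypothetical

kg_d    <- conc_mgL * q_Ls * 86400 / 1e6 #mg/L * L/s * s/d, then mg -> kg
kg_ha_d <- kg_d / ws_area_ha             #normalize by watershed area
kg_ha_d #~0.021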

Part 1: Setup and overview

suppressPackageStartupMessages(library(tidyverse))
library(macrosheds)
  
  macrosheds v1.0.2: Up to date ✓
  
  This package is licensed under MIT, but licensing of MacroSheds data is more complex. See 
  https://docs.google.com/document/d/1CPaQ705QyoWfu6WjA4xHgQooCQ8arq3NZ8DO9tb9jZ0/edit?usp=sharing
  
  Complete metadata: https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=1262
options(readr.show_progress = FALSE,   #prevent gratuitous printing from read_csv
        readr.show_col_types = FALSE,  #same
        timeout = 600)                 #allow more time for dataset downloads

project_root <- '~/macrosheds_workshop'

Retrieval of basic site data, watershed summary attributes

ms_sites <- ms_load_sites() %>% 
    filter(site_type == 'stream_gauge') #some precip gauges have the same names as stream gauges!

?ms_load_variables
ms_ts_variables <- ms_load_variables(var_set = 'timeseries')

?ms_download_ws_attr
# ms_download_ws_attr(project_root, dataset = 'summaries')

?ms_load_product
ws_smry <- ms_load_product(project_root, prodname = 'ws_attr_summaries')

Visualize distributions of MacroSheds watershed summary attributes

Principal component analysis of watershed summary attributes

suppressPackageStartupMessages(library(factoextra))

pca_data <- ws_smry %>% 
    select(where( ~sum(is.na(.)) / length(.) < 0.3)) %>% #drop columns with >= 30% missing values
    drop_na()                                            #drop rows with any missing values

domains <- pca_data$domain

pca_data <- pca_data %>% 
    select(-site_code, -domain, -network) %>% 
    select(where(~sd(., na.rm = TRUE) != 0)) %>%         #drop columns with no variance
    as.matrix()

smry_categories <- substr(colnames(pca_data), 1, 1)
category_map <- c('c' = 'climate', 'h' = 'hydrology', 'l' = 'landcover',
                  'p' = 'parentmat', 't' = 'terrain', 'v' = 'vegetation',
                  'w' = 'ws area')
smry_categories <- factor(category_map[match(smry_categories, names(category_map))])

pca <- prcomp(pca_data, center = TRUE, scale. = TRUE)

fviz_eig(pca)

fviz_pca_biplot(pca, geom.var = 'arrow', geom.ind = 'point', title = '',
                col.var = smry_categories, palette = 'Dark2')

fviz_pca_biplot(pca, geom.var = '', geom.ind = 'point', title = '',
                col.ind = as.factor(domains))
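
If you want the numbers behind the biplots, the loadings live in pca$rotation. A quick base-R peek at PC1:

#ten watershed attributes contributing most to the first principal component
head(sort(abs(pca$rotation[, 1]), decreasing = TRUE), 10)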

Identify domains at elevational extremes (2 of each)

elev_extreme_domains <- ws_smry %>% 
    filter(! is.na(te_elev_mean)) %>% #no watersheds delineated for McMurdo LTER
    group_by(domain) %>% 
    summarize(domain_mean_elev = mean(te_elev_mean)) %>% 
    ungroup() %>% 
    arrange(desc(domain_mean_elev)) %>% 
    slice(c(1:2, (n() - 1):n())) %>% #2 highest and 2 lowest domains, by average watershed elevation
    print() %>%
    pull(domain)
  # A tibble: 4 × 2
    domain     domain_mean_elev
    <chr>                 <dbl>
  1 niwot               3593.  
  2 east_river          3344.  
  3 plum                  31.7 
  4 santee                 9.10

Load site-variable catalog; filter by domain

sitevars <- ms_load_variables('timeseries_by_site')

sitevars <- filter(sitevars,
                   domain %in% elev_extreme_domains,
                   chem_category == 'stream_conc')

length(unique(sitevars$site_code)) # n sites
  [1] 43
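
A quick way to see how those 43 sites are spread across the four domains (distinct avoids counting a site once per variable):

sitevars %>% 
    distinct(domain, site_code) %>% 
    count(domain) #number of sites per domain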

What variables are shared by at least one site from each of these 4 domains?

get_shared_domain_vars <- function(df){

    df %>% 
        group_split(domain) %>%               #one tibble per domain
        map(~ pluck(.x, 'variable_code')) %>% #extract each domain's variable codes
        reduce(intersect)                     #keep only codes shared by all domains
}

get_shared_domain_vars(sitevars)
  [1] "Cl"    "DOC"   "PO4_P" "SO4_S" "TDN"

Again, but keep only variables recorded for >= 2 years, at an average rate of >= 36 obs/year

sitevars <- sitevars %>% 
    mutate(ndays = difftime(last_record_utc, first_record_utc, units = 'days')) %>% 
    filter(ndays >= 2 * 365.25,
           mean_obs_per_day * 365.25 >= 36)

get_shared_domain_vars(sitevars)
  [1] "Cl"    "DOC"   "PO4_P" "SO4_S" "TDN"

Retrieve core time-series data

?ms_download_core_data
# ms_download_core_data(project_root, domains = elev_extreme_domains)

?ms_load_product
(doc <- ms_load_product(
    project_root,
    prodname = 'stream_chemistry',
    filter_vars = 'DOC',
    domain = elev_extreme_domains,
    warn = FALSE
))
  # A tibble: 162,001 × 7
     datetime            site_code var      val ms_status ms_interp val_err
     <dttm>              <chr>     <chr>  <dbl>     <dbl>     <dbl>   <dbl>
   1 2015-11-09 00:00:00 Avery     GN_DOC 0.38          0         0  0.0001
   2 2015-11-10 00:00:00 Avery     GN_DOC 0.374         0         1  0.0001
   3 2015-11-11 00:00:00 Avery     GN_DOC 0.367         0         1  0.0001
   4 2015-11-12 00:00:00 Avery     GN_DOC 0.361         0         1  0.0001
   5 2015-11-13 00:00:00 Avery     GN_DOC 0.354         0         1  0.0001
   6 2015-11-14 00:00:00 Avery     GN_DOC 0.348         0         1  0.0001
   7 2015-11-15 00:00:00 Avery     GN_DOC 0.341         0         1  0.0001
   8 2015-11-16 00:00:00 Avery     GN_DOC 0.335         0         1  0.0001
   9 2015-11-17 00:00:00 Avery     GN_DOC 0.329         0         1  0.0001
  10 2015-11-18 00:00:00 Avery     GN_DOC 0.322         0         1  0.0001
  # ℹ 161,991 more rows
unique(doc$var)
  [1] "GN_DOC"
table(doc$ms_status)
  
       0      1 
  161930     71
table(doc$ms_interp)
  
       0      1 
  126140  35861
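
ms_interp == 1 flags values that were imputed during MacroSheds harmonization (about 22% of this DOC record), while ms_status == 1 flags questionable observations. A one-liner to quantify the former:

round(mean(doc$ms_interp == 1), 3) #proportion of interpolated values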

Visualize stream discharge

library(viridis)
  Loading required package: viridisLite
plot_q <- function(ms_root, domain){

    q <- ms_load_product(ms_root, prodname = 'discharge', domains = domain)
    
    current_year <- as.numeric(strftime(Sys.Date(), format = '%Y'))
    earliest_year <- as.numeric(strftime(min(q$datetime), format = '%Y'))
    nyears <- current_year - earliest_year
    yrcols <- viridis(n = nyears)

    sites <- unique(q$site_code)

    plotrc <- ceiling(sqrt(length(sites)))
    doyseq <- seq(1, 366, 30)
    par(mfrow = c(plotrc, plotrc), mar = c(1, 2, 0, 0), oma = c(0, 0, 2, 0))

    for(s in sites){

        plot(NA, NA, xlim = c(1, 366), ylim = c(0, nyears), xaxs = 'i', yaxs = 'i',
             ylab = '', xlab = '', yaxt = 'n', cex.axis = 0.6, xaxt = 'n', xpd = NA)
        axis(1, doyseq, doyseq, tick = FALSE, line = -2, cex.axis = 0.8)
        axis(2, 1:nyears, earliest_year:(current_year - 1), las = 2, cex.axis = 0.6,
             hadj = 0.7)

        qsub <- q %>%
            filter(site_code == s) %>% #q already contains only the requested domain
            mutate(doy = as.numeric(strftime(datetime, format = '%j', tz = 'UTC')),
                   yr_offset = lubridate::year(datetime) - earliest_year)

        lubridate::year(qsub$datetime) <- 1972
        yrs <- unique(qsub$yr_offset)

        for(i in seq_along(yrs)){
            qss <- qsub %>%
                filter(yr_offset == yrs[i]) %>%
                arrange(doy)
            lines(qss$doy, c(scale(qss$val)) + qss$yr_offset,
                  col = yrcols[yrs[i] + 1]) #index colors by year, robust to gap years
        }

        mtext(s, 3, outer = FALSE, line = -2)
    }

    mtext(paste0(domain, ' (DOY vs. Year)'), 3, outer = TRUE)
}

#these domains aren't among the four downloaded above; retrieve them first, e.g.
#ms_download_core_data(project_root, domains = c('boulder', 'baltimore'))
plot_q(project_root, 'boulder')

# plot_q(project_root, 'fernow')
plot_q(project_root, 'baltimore')

# plot_q(project_root, 'hjandrews')

Visualize solute distributions

plot_conc <- function(ms_root, chemvar){
    
    unit <- ms_ts_variables %>% 
        filter(variable_code == !!chemvar,
               chem_category == 'stream_conc') %>%
        pull(unit)
    
    d <- ms_load_product(ms_root,
                         prodname = 'stream_chemistry',
                         filter_vars = chemvar,
                         warn = FALSE)
    
    ms_sites <- ms_load_sites() %>% 
        filter(site_type != 'precip_gauge')
    
    d <- ms_sites %>% 
        select(site_code, domain, network) %>% 
        right_join(d, by = 'site_code')
    
    domains <- na.omit(unique(d$domain))
    
    dmncolors <- c('brown4', 'brown1', 'blueviolet', 'blue4', 'dodgerblue3', 'blanchedalmond',
                   'bisque4', 'khaki1', 'gray70', 'gray25', 'aquamarine4',
                   'darkorange2', 'darkolivegreen4', 'darkolivegreen1', 'darkmagenta',
                   'darkgoldenrod1', 'cyan3', 'deeppink',
                   'darkred', 'green1', 'palevioletred4', 'peru', 'yellow', 'springgreen2',
                   'mediumorchid3', 'white', 'skyblue', 'burlywood4', 'cornflowerblue')
    dmncolors <- dmncolors[1:length(domains)]
    
    log_ticks <- c(0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000)
    
    d_by_max <- d %>%
        filter(! is.na(val)) %>%
        group_by(network, domain, site_code) %>%
        summarize(maxval = max(val, na.rm = TRUE),
                  medianval = median(val, na.rm = TRUE),
                  n = n(),
                  .groups = 'drop') %>%
        mutate(ntw_dmn_sit = paste(network, domain, site_code,
                                   sep = ' > ')) %>%
        filter(n > 30)
    
    site_order <- d_by_max$ntw_dmn_sit[order(d_by_max$medianval)]
    
    sites_with_data <- d_by_max$site_code
    
    d_boxplot <- d %>%
        filter(site_code %in% !!sites_with_data) %>%
        group_by(network, domain, site_code) %>%
        summarize(box_stats = list(boxplot.stats(val)),
                  .groups = 'drop') %>%
        mutate(color = dmncolors[match(domain, domains)],
               ntw_dmn_sit = paste(network, domain, site_code,
                                   sep = ' > '))
    
    included_domain_inds <- which(domains %in% unique(d_by_max$domain))
    excluded_domains <- domains[-included_domain_inds]
    included_domains <- domains[included_domain_inds]
    included_dmncolors <- dmncolors[included_domain_inds]
    
    d_boxplot <- d_boxplot[rev(order(match(d_boxplot$ntw_dmn_sit, site_order))), ]
    
    nsites <- nrow(d_boxplot)
    ylims <- range(d$val, na.rm = TRUE)
    ylims[ylims == 0] <- 0.001
    ylims <- log(ylims)
    plot(1:nsites, rep(0, nsites), ylim = ylims, type = 'n', yaxt = 'n',
         ylab = paste0('log ', sub('_', '-', chemvar), ' (', unit, ')'),
         xlab = '', xaxt = 'n', yaxs = 'i',
         main = paste(chemvar, 'distribution across sites'), xlim = c(1, nsites))
    axis(2, at = log(log_ticks), labels = log_ticks)
    corners <- par('usr')
    rect(corners[1], corners[3], corners[2], corners[4], col = 'black')
    
    for(j in 1:nsites){
        color <- slice(d_boxplot, j) %>% pull(color)
        stats <- (slice(d_boxplot, j) %>% pull(box_stats))[[1]]
        outliers <- log(stats$out)
        box_whisk <- log(stats$stats)
        segments(x0 = j, x1 = j, y0 = box_whisk[1], y1 = box_whisk[2],
                 col = color, lwd = 2, lend = 3)
        points(x = j, y = box_whisk[3], col = color, pch = 20)
        segments(x0 = j, x1 = j, y0 = box_whisk[4], y1 = box_whisk[5],
                 col = color, lwd = 2, lend = 3)
        points(x = rep(j, length(outliers)), y = outliers, col = color,
               pch = 20, cex = 0.2)
    }
    
    legend('topright', legend = included_domains, lty = 1,
           col = included_dmncolors, bty = 'n', lwd = 3,
           text.col = 'white', ncol = ceiling(length(included_domains) / 7))
    # newline_seq <- try(seq(6, length(excluded_domains), 6), silent=TRUE)
    # if(! inherits(newline_seq, 'try-error')){
    #     excluded_domains[newline_seq] = paste0('\n', excluded_domains[newline_seq])
    # }
    text(x = quantile(1:nsites, 0.15),
         y = quantile(ylims, 0.95), adj = 0, col = 'green',
         labels = paste0('Sites: ', length(unique(d_boxplot$ntw_dmn_sit))))
}

#note: these plots were generated with all MacroSheds domains downloaded; with
#only the four workshop domains, far fewer sites will appear
plot_conc(project_root, 'NO3_N') #nitrate-as-nitrogen

plot_conc(project_root, 'DOC')

plot_conc(project_root, 'SO4_S') #sulfate-as-sulfur

Part 2: Which variables best predict DOC at high- and low-elevation sites?

Remove questionable observations; plot time series of interest

doc_wide <- doc %>% 
    filter(ms_status == 0) %>% #remove ms_status == 1 (questionable)
    select(datetime, site_code, val) %>% 
    left_join(select(ms_sites, site_code, domain), #join column of domain names
              by = 'site_code') %>% 
    filter(! is.na(domain)) %>%  #some site data is missing in MS version 1 (whoops)
    pivot_wider(names_from = c(domain, site_code),
                values_from = val,
                names_sep = '__') %>% #column names are of the form <domain>__<site_code>
    arrange(datetime) %>% #make sure it's all sorted chronologically
    complete(datetime = seq(first(datetime), last(datetime), by = 'day')) #explicate missing observations

suppressPackageStartupMessages(library(xts))
library(dygraphs)

dmn_colors <- factor(str_split_fixed(colnames(doc_wide)[-1], '__', n = 2)[, 1])
levels(dmn_colors) <- hcl.colors(length(elev_extreme_domains))

dg <- dygraph(xts(x = doc_wide[, -1], order.by = doc_wide$datetime)) %>% 
    dyRangeSelector()
for(i in 2:ncol(doc_wide)){
    dg <- dySeries(dg, colnames(doc_wide)[i], color = as.character(dmn_colors[i - 1]))
}
dg

Which watershed attributes predict temporal variation in DOC concentration?

#compute CV for each site
#join ws attrs
#glmnet
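
One way to flesh out that outline (a sketch only; assumes the glmnet package, and names like doc_cv are illustrative). cv.glmnet with alpha = 1 fits a cross-validated LASSO, whose nonzero coefficients are the selected predictors:

library(glmnet)

#coefficient of variation of DOC concentration, by site
doc_cv <- doc %>% 
    filter(ms_status == 0) %>% 
    group_by(site_code) %>% 
    summarize(doc_cv = sd(val, na.rm = TRUE) / mean(val, na.rm = TRUE))

#join watershed attributes; drop sparse/constant columns as in the PCA step
glm_data <- doc_cv %>% 
    inner_join(ws_smry, by = 'site_code') %>% 
    select(where(is.numeric)) %>% 
    select(where(~sum(is.na(.)) / length(.) < 0.3)) %>% 
    drop_na() %>% 
    select(where(~sd(.) != 0))

y <- glm_data$doc_cv
X <- scale(as.matrix(select(glm_data, -doc_cv)))

cvfit <- cv.glmnet(X, y, alpha = 1) #LASSO; penalty chosen by cross-validation
coef(cvfit, s = 'lambda.min')       #nonzero rows = selected predictors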

Convert nitrate-N concentration to daily flux (basic mode)

NO3_N_conc <- ms_load_product(
    project_root,
    prodname = 'stream_chemistry',
    filter_vars = 'NO3_N',
    domain = 'niwot',
    warn = FALSE
)

Q <- ms_load_product(
    project_root,
    prodname = 'discharge',
    domain = 'niwot',
    warn = FALSE
)

NO3_N_conc <- semi_join(NO3_N_conc, Q, by = 'site_code')

#before
filter(NO3_N_conc,
       datetime == as.POSIXct('1985-05-09 00:00:00', tz = 'UTC'),
       site_code == 'ALBION')
  # A tibble: 1 × 7
    datetime            site_code var        val ms_status ms_interp val_err
    <dttm>              <chr>     <chr>    <dbl>     <dbl>     <dbl>   <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.182         0         0    0.01
NO3_N_flux <- suppressPackageStartupMessages({
    ms_calc_flux(NO3_N_conc, Q, q_type = 'discharge')
})
  [1] "q dataset has a daily interval"
#after
filter(NO3_N_flux,
       datetime == as.POSIXct('1985-05-09 00:00:00', tz = 'UTC'),
       site_code == 'ALBION')
  # A tibble: 1 × 7
    datetime            site_code var          val ms_status ms_interp   val_err
    <dttm>              <chr>     <chr>      <dbl>     <dbl>     <dbl>     <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.00113         0         0 0.0000621
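
You can reproduce that value by hand to confirm the unit handling (a sketch; assumes MacroSheds discharge is in L/s and that the site table's watershed-area column is named ws_area_ha):

dt <- as.POSIXct('1985-05-09 00:00:00', tz = 'UTC')
q_val <- filter(Q, site_code == 'ALBION', datetime == dt)$val #L/s (assumed unit)
conc <- 0.182                                                 #mg/L, from the row above
area <- filter(ms_sites, site_code == 'ALBION')$ws_area_ha    #assumed column name
conc * q_val * 86400 / 1e6 / area #should be ~0.00113 kg/ha/d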

Compute annual nitrate-N load (still in development)

# this function will be officially included in the macrosheds package when we
# release version 2 of the MacroSheds dataset, which will include monthly and
# yearly load estimates based on Gubbins et al. in prep. For now, you can call it
# with `:::`, which accesses "internal" package functions.
NO3_N_load <- macrosheds:::ms_calc_flux_rsfme(
    NO3_N_conc, Q,
    method = 'beale',
    aggregation = 'annual'
)

WRTDS load via EGRET package

ms_run_egret(stream_chemistry = filter(NO3_N_conc, site_code == 'ALBION'),
             discharge = filter(Q, site_code == 'ALBION'))

Basic method (just add up daily flux)

library(imputeTS)
  Registered S3 method overwritten by 'quantmod':
    method            from
    as.zoo.data.frame zoo
  
  Attaching package: 'imputeTS'
  The following object is masked from 'package:zoo':
  
      na.locf
siteyears_keep <- NO3_N_flux %>% 
    group_by(site_code, year = strftime(datetime, format = '%Y')) %>%
    summarize(n = length(na.omit(val))) %>% 
    filter(n > 365.25 * 0.5) #keep site-years with >50% daily coverage
  `summarise()` has grouped output by 'site_code'. You can override using the
  `.groups` argument.
NO3_N_flux_interp <- NO3_N_flux %>% 
    filter(site_code == 'ALBION',
           datetime >= as.Date('1996-01-01'),
           datetime <= as.Date('2000-12-31')) %>% 
    arrange(datetime) %>% 
    complete(datetime = seq(first(datetime), last(datetime), by = 'day')) %>% 
    mutate(val = na_interpolation(val)) #linear interpolation (imputeTS default)

NO3_N_flux_interp %>% 
    group_by(year = strftime(datetime, format = '%Y')) %>% 
    summarize(load_kg_ha = sum(val))
  # A tibble: 5 × 2
    year  load_kg_ha
    <chr>      <dbl>
  1 1996       0.452
  2 1997       0.713
  3 1998       0.385
  4 1999       0.885
  5 2000       0.290
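
Note that siteyears_keep, computed above, wasn't applied in this quick demo. To restrict a load table to well-sampled site-years, one option (a sketch; this sums raw daily flux without the interpolation step, so remaining gaps bias loads low):

NO3_N_flux %>% 
    mutate(year = strftime(datetime, format = '%Y')) %>% 
    semi_join(siteyears_keep, by = c('site_code', 'year')) %>% 
    group_by(site_code, year) %>% 
    summarize(load_kg_ha = sum(val, na.rm = TRUE), .groups = 'drop')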

(Un)scale flux by watershed area

(kg_ha_d <- slice(NO3_N_flux, 1:3))
  # A tibble: 3 × 7
    datetime            site_code var          val ms_status ms_interp   val_err
    <dttm>              <chr>     <chr>      <dbl>     <dbl>     <dbl>     <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.00113         0         0 0.0000621
  2 1985-05-10 00:00:00 ALBION    GN_NO3_N 0.00180         0         1 0.000107 
  3 1985-05-11 00:00:00 ALBION    GN_NO3_N 0.00101         0         1 0.0000654
(kg_d <- ms_undo_scale_flux_by_area(kg_ha_d))
  # A tibble: 3 × 7
    datetime            site_code var        val ms_status ms_interp   val_err
    <dttm>              <chr>     <chr>    <dbl>     <dbl>     <dbl>     <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.813         0         0 0.0000621
  2 1985-05-10 00:00:00 ALBION    GN_NO3_N 1.29          0         1 0.000107 
  3 1985-05-11 00:00:00 ALBION    GN_NO3_N 0.724         0         1 0.0000654
(kg_ha_d <- ms_scale_flux_by_area(kg_d))
  # A tibble: 3 × 7
    datetime            site_code var          val ms_status ms_interp   val_err
    <dttm>              <chr>     <chr>      <dbl>     <dbl>     <dbl>     <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.00113         0         0 0.0000621
  2 1985-05-10 00:00:00 ALBION    GN_NO3_N 0.00180         0         1 0.000107 
  3 1985-05-11 00:00:00 ALBION    GN_NO3_N 0.00101         0         1 0.0000654
#NOTE: not every macrosheds function is set up to propagate error (val_err) yet

Part 3: Useful tools

Automated citation/acknowledgement for any subset of the MacroSheds dataset

?ms_generate_attribution

attrib <- ms_generate_attribution(doc, chem_source = 'stream')

ls.str(attrib)
  acknowledgements :  chr "Primary data were provided by the following sources:\n1. East River SFA DOE (Funded by the Department of Energy"| __truncated__
  bibliography :  chr [1:120] "@article{vlah_etal_macrosheds_2023,\n\tauthor = {Vlah, Michael J. and Rhea, Spencer and Bernhardt, Emily S. and"| __truncated__ ...
  full_details_timeseries : tibble [101 × 27] (S3: tbl_df/tbl/data.frame)
  full_details_ws_attr : tibble [28 × 8] (S3: tbl_df/tbl/data.frame)
  intellectual_rights_explanations :  chr [1:4] "A share-alike license means derivative works must propagate the original license terms. If may_disregard_with_p"| __truncated__ ...
  intellectual_rights_notifications : List of 4
   $ sharealike_license      : tibble [1 × 5] (S3: tbl_df/tbl/data.frame)
   $ notify_of_intent_M      : tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
   $ notify_on_distribution_M: tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
   $ provide_access_M        : tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
attrib$intellectual_rights_notifications
  $sharealike_license
  # A tibble: 1 × 5
    network domain macrosheds_prodname       may_disregard_with_permission contact
    <chr>   <chr>  <chr>                     <lgl>                         <chr>  
  1 <NA>    <NA>   tcw (watershed attribute) FALSE                         https:…
  
  $notify_of_intent_M
  # A tibble: 1 × 4
    network domain macrosheds_prodname contact       
    <chr>   <chr>  <chr>               <chr>         
  1 lter    plum   stream_chemistry    pie_im@mbl.edu
  
  $notify_on_distribution_M
  # A tibble: 1 × 4
    network domain macrosheds_prodname contact       
    <chr>   <chr>  <chr>               <chr>         
  1 lter    plum   stream_chemistry    pie_im@mbl.edu
  
  $provide_access_M
  # A tibble: 1 × 4
    network domain macrosheds_prodname contact       
    <chr>   <chr>  <chr>               <chr>         
  1 lter    plum   stream_chemistry    pie_im@mbl.edu
attrib$intellectual_rights_explanations
  [1] "A share-alike license means derivative works must propagate the original license terms. If may_disregard_with_permission is TRUE, you may ask the primary source contact for permission to use your own license."
  [2] "notify_of_intent_M means the primary source requires notice of any plans to publish derivative works that use their data."                                                                                       
  [3] "notify_on_distribution_M means the primary source requires that they be informed of any publications resulting from their data."                                                                                 
  [4] "provide_access_M means the primary source requires online access to any publications resulting from their data."
t(attrib$full_details_timeseries[1, ])
                                 [,1]                                                                                                                                                                                           
  domain                         "east_river"                                                                                                                                                                                   
  network                        "doe"                                                                                                                                                                                          
  macrosheds_prodcode            "VERSIONLESS008"                                                                                                                                                                               
  macrosheds_prodname            "ws_boundary"                                                                                                                                                                                  
  doi                            "10.21952/WTR/1508403"                                                                                                                                                                         
  data_status                    NA                                                                                                                                                                                             
  license                        "https://creativecommons.org/licenses/by/4.0/"                                                                                                                                                 
  license_type                   "Attribution"                                                                                                                                                                                  
  license_sharealike             NA                                                                                                                                                                                             
  IR_acknowledgement_text        NA                                                                                                                                                                                             
  IR_acknowledge_domain          NA                                                                                                                                                                                             
  IR_acknowledge_funding_sources NA                                                                                                                                                                                             
  IR_acknowledge_grant_numbers   NA                                                                                                                                                                                             
  IR_notify_of_intentions        NA                                                                                                                                                                                             
  IR_notify_on_distribution      NA                                                                                                                                                                                             
  IR_provide_online_access       NA                                                                                                                                                                                             
  IR_provide_two_reprints        NA                                                                                                                                                                                             
  IR_collaboration_consultation  NA                                                                                                                                                                                             
  IR_questions                   NA                                                                                                                                                                                             
  IR_needs_clarification         NA                                                                                                                                                                                             
  contact                        "rosemary.carroll@dri.edu"                                                                                                                                                                     
  contact_name1                  "Rosemary Carroll"                                                                                                                                                                             
  creator_name1                  "Rosemary Carroll"                                                                                                                                                                             
  funding                        "Funded by the Department of Energy"                                                                                                                                                           
  citation                       "Carroll, R., Bill, M., Dong, W., & Williams, K. (2019). Sub-Basin Delineation for the Upper East River, Colorado, United States. Watershed Function SFA. https://doi.org/10.21952/WTR/1508403"
  link                           "https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/packages/application%2Fbagit-097/ess-dive-3b9a58adc163fd4-20190509T155206477018"                                                               
  link_download_datetime         "2022-05-16 18:26:32 UTC"
attrib <- ms_generate_attribution(doc, chem_source = 'stream', write_to_dir = '~')
  Warning in dir.create(write_to_dir):
  '/home/mike/macrosheds_attribution_information' already exists
  Output files written to ~/macrosheds_attribution_information/
read_file('~/macrosheds_attribution_information/ms_bibliography.bib') %>% 
    substr(1, 1092) %>% 
    cat()
  @article{vlah_etal_macrosheds_2023,
    author = {Vlah, Michael J. and Rhea, Spencer and Bernhardt, Emily S. and Slaughter, Weston and Gubbins, Nick and DelVecchia, Amanda G. and Thellman, Audrey and Ross, Matthew R. V.},
    title = {MacroSheds: A synthesis of long-term biogeochemical, hydroclimatic, and geospatial data from small watershed ecosystem studies},
    journal = {Limnology and Oceanography Letters},
    volume = {8},
    number = {3},
    pages = {419-452},
    doi = {https://doi.org/10.1002/lol2.10325},
    url = {https://aslopubs.onlinelibrary.wiley.com/doi/abs/10.1002/lol2.10325},
    year = {2023},
  }
  
  @article{carroll_sub-basin_2019,
    title = {Sub-{Basin} {Delineation} for the {Upper} {East} {River}, {Colorado}, {United} {States}},
    doi = {10.21952/WTR/1508403},
    journal = {Watershed Function SFA},
    author = {Carroll, R. and Bill, M. and Dong, W. and Williams, K.},
    year = {2019},
  }
  
  @misc{usda_forest_service_southern_region_francis_2011,
    title = {Francis {Marion} \& {Sumter} {National} {Forests}: {SEF}\_Boundary},
    author = {{USDA Forest Service, Southern Region}},
    year = {2011},
  }

Watershed delineation

(interactive, so output not included in the HTML version of this document)

This can work where StreamStats fails, especially in small watersheds.

# whitebox::install_whitebox() #if this fails, use the next line
# whitebox::install_whitebox(platform = 'linux_musl')

tmp <- tempdir()

out <- ms_delineate_watershed(
    lat = 44.21013,
    long = -122.2571,
    crs = 4326,
    write_dir = tmp,
    write_name = 'example_site'
)

select <- dplyr::select #resolve namespace conflict introduced by raster package

str(out) #specifications of your successful delineation

Load spatial data

(watershed boundaries, precip gauge locations, stream gauge locations)

suppressPackageStartupMessages(library(mapview))

ws_bound <- ms_load_spatial_product(project_root, 'ws_boundary', domain = 'boulder')
prcp_gauges <- ms_load_spatial_product(project_root, 'precip_gauge_locations', domain = 'boulder')
strm_gauges <- ms_load_spatial_product(project_root, 'stream_gauge_locations', domain = 'boulder')

mapview(ws_bound) + mapview(prcp_gauges) + mapview(strm_gauges, col.regions = rainbow(n = 3))

More vignettes

Vignettes will only load if you installed macrosheds with build_vignettes = TRUE, but they’re also hosted as markdown files in the macrosheds GitHub repository. These provide tutorials on data retrieval, flux calculation, watershed delineation, and precip interpolation.

vignette(package = 'macrosheds')
vignette('ms_watershed_delineation', package = 'macrosheds')
?ms_synchronize_timestep  # upsample or downsample macrosheds data
?ms_calc_watershed_precip # spatial interpolation of precip gauge data
?ms_separate_baseflow     # baseflow/stormflow separation via hydrostats

Part 4: Linking to the CAMELS datasets

Download CAMELS-US

(just watershed descriptors and streamflow time series)

camels_dir <- file.path(project_root, 'camels')
dir.create(camels_dir, showWarnings = FALSE)

attr_categories <- c('clim', 'geol', 'soil', 'topo', 'vege', 'hydro')

#watershed attributes
for(x in attr_categories){
    download.file(paste0('https://gdex.ucar.edu/dataset/camels/file/camels_', x, '.txt'),
                  destfile = paste0(camels_dir, '/', x, '.txt'))
}

# #Daymet forcings
# download.file('https://gdex.ucar.edu/dataset/camels/file/basin_timeseries_v1p2_modelOutput_daymet.zip',
#               destfile = file.path(camels_dir, 'daymet.zip'))

#observed flow
# download.file('https://gdex.ucar.edu/dataset/camels/file/basin_timeseries_v1p2_metForcing_obsFlow.zip',
#               destfile = file.path(camels_dir, 'flow.zip'))
# unzip(file.path(camels_dir, 'flow.zip')) #untested. be aware of the truncation issue described in ?unzip

Visualize CAMELS

view_attr_dists <- function(path_to_camels){

    char_cols <- c('dom_land_cover', 'geol_1st_class', 'geol_2nd_class',
                   'high_prec_timing', 'low_prec_timing')
    
    vege <- read_delim(file.path(path_to_camels, 'vege.txt'), delim = ';')
    topo <- read_delim(file.path(path_to_camels, 'topo.txt'), delim = ';')
    geol <- read_delim(file.path(path_to_camels, 'geol.txt'), delim = ';')
    soil <- read_delim(file.path(path_to_camels, 'soil.txt'), delim = ';')
    clim <- read_delim(file.path(path_to_camels, 'clim.txt'), delim = ';')
    hydro <- read_delim(file.path(path_to_camels, 'hydro.txt'), delim = ';')

    categories <- c(rep('vege', ncol(vege) - 1),
                    rep('topo', ncol(topo) - 1),
                    rep('geol', ncol(geol) - 1),
                    rep('soil', ncol(soil) - 1),
                    rep('clim', ncol(clim) - 1),
                    rep('hydr', ncol(hydro) - 1))
    
    d <- reduce(list(vege, topo, geol, soil, clim, hydro), full_join, by = 'gauge_id') %>% 
        # select(-matches(!!char_cols)) %>% 
        mutate(across(-c(gauge_id, !!char_cols), as.numeric))
        
    category_colors <- c('vege' = 'forestgreen',
                         'topo' = 'black',
                         'geol' = 'gray50',
                         'soil' = 'orange4',
                         'clim' = 'darkblue',
                         'hydr' = 'turquoise3')
    
    nrows <- ceiling(sqrt(length(categories)))
    ncols <- ceiling((length(categories) + 1) / nrows) #+1 panel for the legend
    par(mfrow = c(nrows, ncols),
        mar = c(1, 2, 0, 0), oma = rep(0.5, 4))
    
    plot(1, 1, type = 'n', axes = F)
    legend('left', legend = names(category_colors), col = unname(category_colors),
           lty = 1, lwd = 3, bty = 'n')
    legend('right', legend = c('CAMELS', 'MacroSheds'),
           fill = c('gray70', 'darkviolet'), bty = 'n', border = NA)
    
    for(i in 2:ncol(d)){
        
        varname <- colnames(d)[i]
        if(! varname %in% char_cols){
            
            dvar <- d %>% 
                select(gauge_id, all_of(varname)) %>% 
                filter(! is.na(.data[[varname]])) %>% #.data[[varname]] targets the column, not the string
                arrange(.data[[varname]])
            
            source_color <- if_else(grepl('^[0-9]{6,}$', dvar$gauge_id), 'gray70', 'darkviolet')
            color <- category_colors[categories[i - 1]]
            # color <- category_colors[names(category_colors) == categories[i - 1]]
            
            barplot(dvar[[varname]], col = source_color, border = source_color)
            box(col = color, lwd = 4)
            mtext(varname, 3, line = -1.5, col = 'black')
        }
    }
}

view_q_dists <- function(path_to_camels){
    
    flowdir <- file.path(path_to_camels, 'basin_dataset_public_v1p2/usgs_streamflow')
    flowfiles <- list.files(flowdir, recursive = TRUE, pattern = 'streamflow_qc\\.txt$',
                            full.names = TRUE)
    
    par(mfrow = c(1, 1), mar = c(2, 4, 1, 1))
    plot(as.Date(c('1930-01-01', '2023-07-30')), log(c(0.001, 1e6)), type = 'n',
         xlab = '', ylab = 'Log discharge (cfs)', yaxt = 'n')
    yaxis <- c(0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5, 1e6)
    axis(2, log(yaxis), labels = as.character(yaxis))
    
    for(f in flowfiles){
        
        flow <- read.fwf(
            f,
            widths = c(9, 4, 3, 3, 9, 2),
            header = FALSE,
            strip.white = TRUE,
            colClasses = 'character',
            col.names = c('gauge_id', 'year', 'month', 'day', 'Q_cfs', 'flag')
        ) %>%
            as_tibble() %>%
            mutate(date = as.Date(paste(year, month, day), format = '%Y %m %d'),
                   Q_cfs = as.numeric(Q_cfs)) %>%
            mutate(Q_cfs = if_else(Q_cfs == -999, NA_real_, Q_cfs), #-999 is the missing-data sentinel
                   Q_cfs = log(Q_cfs)) #log only after replacing sentinels
        
        lines(flow$date, flow$Q_cfs, col = adjustcolor('black', alpha.f = 0.3))
        
                   #      mm/d        cfs         m^3/ft^3      mm/m   s/d     m^2
                   # discharge = discharge * 0.028316846 * 1000 * 86400 / ws_area_m2)
    }
}
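
The dormant comment in that loop sketches the conversion from cfs to area-normalized runoff. As a standalone helper (an illustration; the watershed area would come from CAMELS' topo attributes, converted to m^2):

cfs_to_mm_d <- function(q_cfs, ws_area_m2){
    #cfs * (m^3/ft^3) = m^3/s; * (s/d) = m^3/d; / m^2 = m/d; * (mm/m) = mm/d
    q_cfs * 0.028316846 * 86400 / ws_area_m2 * 1000
}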

view_attr_dists(camels_dir)

Merge MacroSheds and CAMELS

ms_download_ws_attr(project_root, dataset = 'CAMELS summaries')
  Downloading dataset type: CAMELS summaries (1/1; download id 39944257)
  watershed_summaries_CAMELS.feather successfully downloaded to ~/macrosheds_workshop
ms_cmls <- ms_load_product(project_root, prodname = 'ws_attr_CAMELS_summaries') %>% 
    rename(geol_porostiy = geol_porosity, #accommodate typo in CAMELS dataset
           q5 = Q5, q95 = Q95, baseflow_index = baseflow_index_landson) #and some slight name differences

#proportion of non-missing values per column
apply(ms_cmls, MARGIN = 2, function(x) round(sum(! is.na(x)) / length(x), 2))
             site_code               domain              network 
                  1.00                 1.00                 1.00 
                p_mean             pet_mean              aridity 
                  0.93                 0.93                 0.93 
         p_seasonality            frac_snow       high_prec_freq 
                  0.93                 0.93                 0.93 
         high_prec_dur     high_prec_timing        low_prec_freq 
                  0.93                 0.93                 0.93 
          low_prec_dur      low_prec_timing       geol_1st_class 
                  0.93                 0.93                 0.00 
   glim_1st_class_frac       geol_2nd_class  glim_2nd_class_frac 
                  0.97                 0.00                 0.97 
  carbonate_rocks_frac        geol_porostiy    geol_permeability 
                  0.97                 0.91                 0.91 
             sand_frac            silt_frac            clay_frac 
                  0.76                 0.76                 0.76 
          organic_frac            gauge_lat            gauge_lon 
                  0.76                 1.00                 1.00 
                  area            elev_mean           slope_mean 
                  0.99                 0.99                 0.99 
           frac_forest  dom_land_cover_frac       dom_land_cover 
                  0.89                 0.89                 0.89 
         root_depth_50        root_depth_99               q_mean 
                  0.89                 0.89                 0.93 
          runoff_ratio          stream_elas            slope_fdc 
                  0.93                 0.93                 0.87 
        baseflow_index             hfd_mean                   q5 
                  0.93                 0.93                 0.93 
                   q95          high_q_freq           high_q_dur 
                  0.93                 0.93                 0.93 
            low_q_freq            low_q_dur          zero_q_freq 
                  0.93                 0.93                 0.93
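
The same summary in dplyr, if you prefer (equivalent to the apply() call above):

ms_cmls %>% 
    summarize(across(everything(), ~round(mean(! is.na(.x)), 2)))
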
camels_ms_dir <- file.path(project_root, 'camels_macrosheds_merge')
dir.create(camels_ms_dir, showWarnings = FALSE)

cmls_topo <- read_delim(file.path(camels_dir, 'topo.txt'), delim = ';') %>% 
    rename(area = area_gages2) %>% #could also use area_geospa_fabric here
    mutate(across(everything(), as.character))
    
ms_cmls %>% 
    select(gauge_id = site_code, matches(colnames(cmls_topo)), 'area') %>%
    mutate(across(everything(), as.character)) %>% 
    bind_rows(cmls_topo) %>% 
    write_delim(file.path(camels_ms_dir, 'topo.txt'), delim = ';')

camels_ms_merge <- function(dataset, read_dir, write_dir){
    
    #dataset: character; one of "vege", "topo", "soil", "geol", "hydro", "clim"
    
    cmls_d <- read_delim(paste0(read_dir, '/', dataset, '.txt'), delim = ';') %>% 
        mutate(across(everything(), as.character))
    
    missing_vars <- setdiff(colnames(cmls_d), colnames(ms_cmls)) %>% 
        str_subset('gauge_id', negate = TRUE)
    
    if(length(missing_vars)){
        print(paste('MacroSheds is missing some CAMELS variables:',
                    paste(missing_vars, collapse = ', ')))
    }
    
    ms_cmls %>% 
        select(gauge_id = site_code, matches(colnames(cmls_d))) %>%
        mutate(across(everything(), as.character)) %>% 
        bind_rows(cmls_d) %>% 
        write_delim(paste0(write_dir, '/', dataset, '.txt'), delim = ';')
}

camels_ms_merge('vege', read_dir = camels_dir, write_dir = camels_ms_dir)
  [1] "MacroSheds is missing some CAMELS variables: lai_max, lai_diff, gvf_max, gvf_diff"
camels_ms_merge('geol', read_dir = camels_dir, write_dir = camels_ms_dir)
camels_ms_merge('soil', read_dir = camels_dir, write_dir = camels_ms_dir)
  [1] "MacroSheds is missing some CAMELS variables: soil_depth_pelletier, soil_depth_statsgo, soil_porosity, soil_conductivity, max_water_content, water_frac, other_frac"
camels_ms_merge('clim', read_dir = camels_dir, write_dir = camels_ms_dir)
camels_ms_merge('hydro', read_dir = camels_dir, write_dir = camels_ms_dir)

Visualize again

view_attr_dists(camels_ms_dir)