Before the workshop

Please install the following packages:

#macrosheds
devtools::install_github('https://github.com/MacroSHEDS/macrosheds.git')

#tidyverse (dplyr, stringr, readr, etc.)
install.packages('tidyverse')

Then download the MacroSheds data we will use:

library(macrosheds)

# this is where files will be downloaded. feel free to change it.
project_root <- '~/macrosheds_workshop'

# datasets we will use
ms_download_ws_attr(project_root, dataset = 'summaries')
ms_download_core_data(project_root, domains = c('niwot', 'plum', 'east_river', 'santee'))

These packages are optional (only used here and there, and the overall sequence doesn’t depend on them):

install.packages('xts')         # time series representation
install.packages('dygraphs')    # interactive plots
install.packages('viridis')     # colorblind-friendly color palettes
install.packages('factoextra')  # ordination, etc.
install.packages('mapview')     # interactive maps
install.packages('data.table')  # efficient computation
install.packages('whitebox')    # geospatial processing
whitebox::install_whitebox() #additional whitebox binaries. if this fails, use the next line
# whitebox::install_whitebox(platform = 'linux_musl')

MacroSheds

long-term watershed ecosystem studies, harmonized

Goals

  • enable calculation of input/output solute fluxes from diverse watersheds
  • lower barriers to accessing diverse watershed data through data harmonization and visualization
  • investigate:
    • variation in magnitude, timing, form of N exports
    • broad effects of watershed acidification
    • variation in mineral weathering
    • watershed sensitivity to climate change

Terms

  • site: an individual gauging station or stream sampling location and its watershed
  • domain: one or more sites under common management
  • network: one or more domains under common funding/leadership
  • flux: solute mass normalized to watershed area, over time (kg/ha/d)
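
For concreteness, here is the arithmetic behind that flux unit (hypothetical numbers; ms_calc_flux, used in Part 2, handles this for real data):

conc_mgL   <- 0.5    #solute concentration (mg/L); hypothetical
q_Ls       <- 120    #discharge (L/s); hypothetical
ws_area_ha <- 250    #watershed area (ha); hypothetical

kg_d    <- conc_mgL * q_Ls * 86400 / 1e6 #mg/L * L/s * s/d, then mg -> kg
kg_ha_d <- kg_d / ws_area_ha             #normalize by watershed area
kg_ha_d #~0.021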

Part 1: Setup and overview

suppressPackageStartupMessages(library(tidyverse))
library(macrosheds)
  
  macrosheds v1.0.2: Up to date ✓
  
  This package is licensed under MIT, but licensing of MacroSheds data is more complex. See 
  https://docs.google.com/document/d/1CPaQ705QyoWfu6WjA4xHgQooCQ8arq3NZ8DO9tb9jZ0/edit?usp=sharing
  
  Complete metadata: https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=1262
options(readr.show_progress = FALSE,   #prevent gratuitous printing from read_csv
        readr.show_col_types = FALSE,  #same
        timeout = 600)                 #allow more time for dataset downloads

project_root <- '~/macrosheds_workshop'

Retrieval of basic site data, watershed summary attributes

ms_sites <- ms_load_sites() %>% 
    filter(site_type == 'stream_gauge') #some precip gauges have the same names as stream gauges!

?ms_load_variables
ms_ts_variables <- ms_load_variables(var_set = 'timeseries')

?ms_download_ws_attr
# ms_download_ws_attr(project_root, dataset = 'summaries')

?ms_load_product
ws_smry <- ms_load_product(project_root, prodname = 'ws_attr_summaries')

Visualize distributions of MacroSheds watershed summary attributes

Principal component analysis of watershed summary attributes

suppressPackageStartupMessages(library(factoextra))

pca_data <- ws_smry %>% 
    select(where( ~sum(is.na(.)) / length(.) < 0.3)) %>% #drop columns with >= 30% missing values
    drop_na()                                            #drop rows with any missing values

domains <- pca_data$domain

pca_data <- pca_data %>% 
    select(-site_code, -domain, -network) %>% 
    select(where(~sd(., na.rm = TRUE) != 0)) %>%         #drop columns with no variance
    as.matrix()

smry_categories <- substr(colnames(pca_data), 1, 1)
category_map <- c('c' = 'climate', 'h' = 'hydrology', 'l' = 'landcover',
                  'p' = 'parentmat', 't' = 'terrain', 'v' = 'vegetation',
                  'w' = 'ws area')
smry_categories <- factor(category_map[match(smry_categories, names(category_map))])

pca <- prcomp(pca_data, center = TRUE, scale. = TRUE)

fviz_eig(pca)

fviz_pca_biplot(pca, geom.var = 'arrow', geom.ind = 'point', title = '',
                col.var = smry_categories, palette = 'Dark2')

fviz_pca_biplot(pca, geom.var = '', geom.ind = 'point', title = '',
                col.ind = as.factor(domains))
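
If you want the numbers behind the biplots, the loadings live in pca$rotation. A quick base-R peek at PC1:

#ten watershed attributes contributing most to the first principal component
head(sort(abs(pca$rotation[, 1]), decreasing = TRUE), 10)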

Identify domains at elevational extremes (2 of each)

elev_extreme_domains <- ws_smry %>% 
    filter(! is.na(te_elev_mean)) %>% #no watersheds delineated for McMurdo LTER
    group_by(domain) %>% 
    summarize(domain_mean_elev = mean(te_elev_mean)) %>% 
    ungroup() %>% 
    arrange(desc(domain_mean_elev)) %>% 
    slice(c(1:2, (n() - 1):n())) %>% #2 highest and 2 lowest domains, by average watershed elevation
    print() %>%
    pull(domain)
  # A tibble: 4 × 2
    domain     domain_mean_elev
    <chr>                 <dbl>
  1 niwot               3593.  
  2 east_river          3344.  
  3 plum                  31.7 
  4 santee                 9.10

Load site-variable catalog; filter by domain

sitevars <- ms_load_variables('timeseries_by_site')

sitevars <- filter(sitevars,
                   domain %in% elev_extreme_domains,
                   chem_category == 'stream_conc')

length(unique(sitevars$site_code)) # n sites
  [1] 43
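
A quick way to see how those 43 sites are spread across the four domains (distinct avoids counting a site once per variable):

sitevars %>% 
    distinct(domain, site_code) %>% 
    count(domain) #number of sites per domain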

What variables are shared by at least one site from each of these 4 domains?

get_shared_domain_vars <- function(df){

    df %>% 
        group_split(domain) %>%               #one tibble per domain
        map(~ pluck(.x, 'variable_code')) %>% #extract each domain's variable codes
        reduce(intersect)                     #keep only codes shared by all domains
}

get_shared_domain_vars(sitevars)
  [1] "Cl"    "DOC"   "PO4_P" "SO4_S" "TDN"

Again, but keep only variables recorded for >= 2 years, at an average rate of >= 36 obs/year

sitevars <- sitevars %>% 
    mutate(ndays = difftime(last_record_utc, first_record_utc, units = 'days')) %>% 
    filter(ndays >= 2 * 365.25,
           mean_obs_per_day * 365.25 >= 36)

get_shared_domain_vars(sitevars)
  [1] "Cl"    "DOC"   "PO4_P" "SO4_S" "TDN"

Retrieve core time-series data

?ms_download_core_data
# ms_download_core_data(project_root, domains = elev_extreme_domains)

?ms_load_product
(doc <- ms_load_product(
    project_root,
    prodname = 'stream_chemistry',
    filter_vars = 'DOC',
    domain = elev_extreme_domains,
    warn = FALSE
))
  # A tibble: 162,001 × 7
     datetime            site_code var      val ms_status ms_interp val_err
     <dttm>              <chr>     <chr>  <dbl>     <dbl>     <dbl>   <dbl>
   1 2015-11-09 00:00:00 Avery     GN_DOC 0.38          0         0  0.0001
   2 2015-11-10 00:00:00 Avery     GN_DOC 0.374         0         1  0.0001
   3 2015-11-11 00:00:00 Avery     GN_DOC 0.367         0         1  0.0001
   4 2015-11-12 00:00:00 Avery     GN_DOC 0.361         0         1  0.0001
   5 2015-11-13 00:00:00 Avery     GN_DOC 0.354         0         1  0.0001
   6 2015-11-14 00:00:00 Avery     GN_DOC 0.348         0         1  0.0001
   7 2015-11-15 00:00:00 Avery     GN_DOC 0.341         0         1  0.0001
   8 2015-11-16 00:00:00 Avery     GN_DOC 0.335         0         1  0.0001
   9 2015-11-17 00:00:00 Avery     GN_DOC 0.329         0         1  0.0001
  10 2015-11-18 00:00:00 Avery     GN_DOC 0.322         0         1  0.0001
  # ℹ 161,991 more rows
unique(doc$var)
  [1] "GN_DOC"
table(doc$ms_status)
  
       0      1 
  161930     71
table(doc$ms_interp)
  
       0      1 
  126140  35861
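
ms_interp == 1 flags values that were imputed during MacroSheds harmonization (about 22% of this DOC record), while ms_status == 1 flags questionable observations. A one-liner to quantify the former:

round(mean(doc$ms_interp == 1), 3) #proportion of interpolated values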

Visualize stream discharge

library(viridis)
  Loading required package: viridisLite
plot_q <- function(ms_root, domain){

    q <- ms_load_product(ms_root, prodname = 'discharge', domains = domain)
    
    current_year <- as.numeric(strftime(Sys.Date(), format = '%Y'))
    earliest_year <- as.numeric(strftime(min(q$datetime), format = '%Y'))
    nyears <- current_year - earliest_year
    yrcols <- viridis(n = nyears)

    sites <- unique(q$site_code)

    plotrc <- ceiling(sqrt(length(sites)))
    doyseq <- seq(1, 366, 30)
    par(mfrow = c(plotrc, plotrc), mar = c(1, 2, 0, 0), oma = c(0, 0, 2, 0))

    for(s in sites){

        plot(NA, NA, xlim = c(1, 366), ylim = c(0, nyears), xaxs = 'i', yaxs = 'i',
             ylab = '', xlab = '', yaxt = 'n', cex.axis = 0.6, xaxt = 'n', xpd = NA)
        axis(1, doyseq, doyseq, tick = FALSE, line = -2, cex.axis = 0.8)
        axis(2, 1:nyears, earliest_year:(current_year - 1), las = 2, cex.axis = 0.6,
             hadj = 0.7)

        qsub <- q %>%
            filter(site_code == s) %>% #q already contains only the requested domain
            mutate(doy = as.numeric(strftime(datetime, format = '%j', tz = 'UTC')),
                   yr_offset = lubridate::year(datetime) - earliest_year)

        lubridate::year(qsub$datetime) <- 1972
        yrs <- unique(qsub$yr_offset)

        for(i in seq_along(yrs)){
            qss <- qsub %>%
                filter(yr_offset == yrs[i]) %>%
                arrange(doy)
            lines(qss$doy, c(scale(qss$val)) + qss$yr_offset,
                  col = yrcols[yrs[i] + 1]) #index colors by year, robust to gap years
        }

        mtext(s, 3, outer = FALSE, line = -2)
    }

    mtext(paste0(domain, ' (DOY vs. Year)'), 3, outer = TRUE)
}

#these domains aren't among the four downloaded above; retrieve them first, e.g.
#ms_download_core_data(project_root, domains = c('boulder', 'baltimore'))
plot_q(project_root, 'boulder')

# plot_q(project_root, 'fernow')
plot_q(project_root, 'baltimore')

# plot_q(project_root, 'hjandrews')

Visualize solute distributions

plot_conc <- function(ms_root, chemvar){
    
    unit <- ms_ts_variables %>% 
        filter(variable_code == !!chemvar,
               chem_category == 'stream_conc') %>%
        pull(unit)
    
    d <- ms_load_product(ms_root,
                         prodname = 'stream_chemistry',
                         filter_vars = chemvar,
                         warn = FALSE)
    
    ms_sites <- ms_load_sites() %>% 
        filter(site_type != 'precip_gauge')
    
    d <- ms_sites %>% 
        select(site_code, domain, network) %>% 
        right_join(d, by = 'site_code')
    
    domains <- na.omit(unique(d$domain))
    
    dmncolors <- c('brown4', 'brown1', 'blueviolet', 'blue4', 'dodgerblue3', 'blanchedalmond',
                   'bisque4', 'khaki1', 'gray70', 'gray25', 'aquamarine4',
                   'darkorange2', 'darkolivegreen4', 'darkolivegreen1', 'darkmagenta',
                   'darkgoldenrod1', 'cyan3', 'deeppink',
                   'darkred', 'green1', 'palevioletred4', 'peru', 'yellow', 'springgreen2',
                   'mediumorchid3', 'white', 'skyblue', 'burlywood4', 'cornflowerblue')
    dmncolors <- dmncolors[1:length(domains)]
    
    log_ticks <- c(0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000)
    
    d_by_max <- d %>%
        filter(! is.na(val)) %>%
        group_by(network, domain, site_code) %>%
        summarize(maxval = max(val, na.rm = TRUE),
                  medianval = median(val, na.rm = TRUE),
                  n = n(),
                  .groups = 'drop') %>%
        mutate(ntw_dmn_sit = paste(network, domain, site_code,
                                   sep = ' > ')) %>%
        filter(n > 30)
    
    site_order <- d_by_max$ntw_dmn_sit[order(d_by_max$medianval)]
    
    sites_with_data <- d_by_max$site_code
    
    d_boxplot <- d %>%
        filter(site_code %in% !!sites_with_data) %>%
        group_by(network, domain, site_code) %>%
        summarize(box_stats = list(boxplot.stats(val)),
                  .groups = 'drop') %>%
        mutate(color = dmncolors[match(domain, domains)],
               ntw_dmn_sit = paste(network, domain, site_code,
                                   sep = ' > '))
    
    included_domain_inds <- which(domains %in% unique(d_by_max$domain))
    excluded_domains <- domains[-included_domain_inds]
    included_domains <- domains[included_domain_inds]
    included_dmncolors <- dmncolors[included_domain_inds]
    
    d_boxplot <- d_boxplot[rev(order(match(d_boxplot$ntw_dmn_sit, site_order))), ]
    
    nsites <- nrow(d_boxplot)
    ylims <- range(d$val, na.rm = TRUE)
    ylims[ylims == 0] <- 0.001
    ylims <- log(ylims)
    plot(1:nsites, rep(0, nsites), ylim = ylims, type = 'n', yaxt = 'n',
         ylab = paste0('log ', sub('_', '-', chemvar), ' (', unit, ')'),
         xlab = '', xaxt = 'n', yaxs = 'i',
         main = paste(chemvar, 'distribution across sites'), xlim = c(1, nsites))
    axis(2, at = log(log_ticks), labels = log_ticks)
    corners <- par('usr')
    rect(corners[1], corners[3], corners[2], corners[4], col = 'black')
    
    for(j in 1:nsites){
        color <- slice(d_boxplot, j) %>% pull(color)
        stats <- (slice(d_boxplot, j) %>% pull(box_stats))[[1]]
        outliers <- log(stats$out)
        box_whisk <- log(stats$stats)
        segments(x0 = j, x1 = j, y0 = box_whisk[1], y1 = box_whisk[2],
                 col = color, lwd = 2, lend = 3)
        points(x = j, y = box_whisk[3], col = color, pch = 20)
        segments(x0 = j, x1 = j, y0 = box_whisk[4], y1 = box_whisk[5],
                 col = color, lwd = 2, lend = 3)
        points(x = rep(j, length(outliers)), y = outliers, col = color,
               pch = 20, cex = 0.2)
    }
    
    legend('topright', legend = included_domains, lty = 1,
           col = included_dmncolors, bty = 'n', lwd = 3,
           text.col = 'white', ncol = ceiling(length(included_domains) / 7))
    # newline_seq <- try(seq(6, length(excluded_domains), 6), silent=TRUE)
    # if(! inherits(newline_seq, 'try-error')){
    #     excluded_domains[newline_seq] = paste0('\n', excluded_domains[newline_seq])
    # }
    text(x = quantile(1:nsites, 0.15),
         y = quantile(ylims, 0.95), adj = 0, col = 'green',
         labels = paste0('Sites: ', length(unique(d_boxplot$ntw_dmn_sit))))
}

#note: these plots were generated with all MacroSheds domains downloaded; with
#only the four workshop domains, far fewer sites will appear
plot_conc(project_root, 'NO3_N') #nitrate-as-nitrogen

plot_conc(project_root, 'DOC')

plot_conc(project_root, 'SO4_S') #sulfate-as-sulfur

Part 2: Which variables best predict DOC at high- and low-elevation sites?

Remove questionable observations; plot time series of interest

doc_wide <- doc %>% 
    filter(ms_status == 0) %>% #remove ms_status == 1 (questionable)
    select(datetime, site_code, val) %>% 
    left_join(select(ms_sites, site_code, domain), #join column of domain names
              by = 'site_code') %>% 
    filter(! is.na(domain)) %>%  #some site data is missing in MS version 1 (whoops)
    pivot_wider(names_from = c(domain, site_code),
                values_from = val,
                names_sep = '__') %>% #column names are of the form <domain>__<site_code>
    arrange(datetime) %>% #make sure it's all sorted chronologically
    complete(datetime = seq(first(datetime), last(datetime), by = 'day')) #explicate missing observations

suppressPackageStartupMessages(library(xts))
library(dygraphs)

dmn_colors <- factor(str_split_fixed(colnames(doc_wide)[-1], '__', n = 2)[, 1])
levels(dmn_colors) <- hcl.colors(length(elev_extreme_domains))

dg <- dygraph(xts(x = doc_wide[, -1], order.by = doc_wide$datetime)) %>% 
    dyRangeSelector()
for(i in 2:ncol(doc_wide)){
    dg <- dySeries(dg, colnames(doc_wide)[i], color = as.character(dmn_colors[i - 1]))
}
dg

Which watershed attributes predict temporal variation in DOC concentration?

#compute CV for each site
#join ws attrs
#glmnet
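
One way to flesh out that outline (a sketch only; assumes the glmnet package, and names like doc_cv are illustrative). cv.glmnet with alpha = 1 fits a cross-validated LASSO, whose nonzero coefficients are the selected predictors:

library(glmnet)

#coefficient of variation of DOC concentration, by site
doc_cv <- doc %>% 
    filter(ms_status == 0) %>% 
    group_by(site_code) %>% 
    summarize(doc_cv = sd(val, na.rm = TRUE) / mean(val, na.rm = TRUE))

#join watershed attributes; drop sparse/constant columns as in the PCA step
glm_data <- doc_cv %>% 
    inner_join(ws_smry, by = 'site_code') %>% 
    select(where(is.numeric)) %>% 
    select(where(~sum(is.na(.)) / length(.) < 0.3)) %>% 
    drop_na() %>% 
    select(where(~sd(.) != 0))

y <- glm_data$doc_cv
X <- scale(as.matrix(select(glm_data, -doc_cv)))

cvfit <- cv.glmnet(X, y, alpha = 1) #LASSO; penalty chosen by cross-validation
coef(cvfit, s = 'lambda.min')       #nonzero rows = selected predictors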

Convert nitrate-N concentration to daily flux (basic mode)

NO3_N_conc <- ms_load_product(
    project_root,
    prodname = 'stream_chemistry',
    filter_vars = 'NO3_N',
    domain = 'niwot',
    warn = FALSE
)

Q <- ms_load_product(
    project_root,
    prodname = 'discharge',
    domain = 'niwot',
    warn = FALSE
)

NO3_N_conc <- semi_join(NO3_N_conc, Q, by = 'site_code')

#before
filter(NO3_N_conc,
       datetime == as.POSIXct('1985-05-09 00:00:00', tz = 'UTC'),
       site_code == 'ALBION')
  # A tibble: 1 × 7
    datetime            site_code var        val ms_status ms_interp val_err
    <dttm>              <chr>     <chr>    <dbl>     <dbl>     <dbl>   <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.182         0         0    0.01
NO3_N_flux <- suppressPackageStartupMessages({
    ms_calc_flux(NO3_N_conc, Q, q_type = 'discharge')
})
  [1] "q dataset has a daily interval"
#after
filter(NO3_N_flux,
       datetime == as.POSIXct('1985-05-09 00:00:00', tz = 'UTC'),
       site_code == 'ALBION')
  # A tibble: 1 × 7
    datetime            site_code var          val ms_status ms_interp   val_err
    <dttm>              <chr>     <chr>      <dbl>     <dbl>     <dbl>     <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.00113         0         0 0.0000621
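
You can reproduce that value by hand to confirm the unit handling (a sketch; assumes MacroSheds discharge is in L/s and that the site table's watershed-area column is named ws_area_ha):

dt <- as.POSIXct('1985-05-09 00:00:00', tz = 'UTC')
q_val <- filter(Q, site_code == 'ALBION', datetime == dt)$val #L/s (assumed unit)
conc <- 0.182                                                 #mg/L, from the row above
area <- filter(ms_sites, site_code == 'ALBION')$ws_area_ha    #assumed column name
conc * q_val * 86400 / 1e6 / area #should be ~0.00113 kg/ha/d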

Compute annual nitrate-N load (still in development)

# this function will be officially included in the macrosheds package when we
# release version 2 of the MacroSheds dataset, which will include monthly and
# yearly load estimates based on Gubbins et al. in prep. For now, you can call it
# with `:::`, which accesses "internal" package functions.
NO3_N_load <- macrosheds:::ms_calc_flux_rsfme(
    NO3_N_conc, Q,
    method = 'beale',
    aggregation = 'annual'
)

WRTDS load via EGRET package

ms_run_egret(stream_chemistry = filter(NO3_N_conc, site_code == 'ALBION'),
             discharge = filter(Q, site_code == 'ALBION'))

Basic method (just add up daily flux)

library(imputeTS)
  Registered S3 method overwritten by 'quantmod':
    method            from
    as.zoo.data.frame zoo
  
  Attaching package: 'imputeTS'
  The following object is masked from 'package:zoo':
  
      na.locf
siteyears_keep <- NO3_N_flux %>% 
    group_by(site_code, year = strftime(datetime, format = '%Y')) %>%
    summarize(n = length(na.omit(val))) %>% 
    filter(n > 365.25 * 0.5) #keep site-years with >50% daily coverage
  `summarise()` has grouped output by 'site_code'. You can override using the
  `.groups` argument.
NO3_N_flux_interp <- NO3_N_flux %>% 
    filter(site_code == 'ALBION',
           datetime >= as.Date('1996-01-01'),
           datetime <= as.Date('2000-12-31')) %>% 
    arrange(datetime) %>% 
    complete(datetime = seq(first(datetime), last(datetime), by = 'day')) %>% 
    mutate(val = na_interpolation(val)) #linear interpolation (imputeTS default)

NO3_N_flux_interp %>% 
    group_by(year = strftime(datetime, format = '%Y')) %>% 
    summarize(load_kg_ha = sum(val))
  # A tibble: 5 × 2
    year  load_kg_ha
    <chr>      <dbl>
  1 1996       0.452
  2 1997       0.713
  3 1998       0.385
  4 1999       0.885
  5 2000       0.290
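
Note that siteyears_keep, computed above, wasn't applied in this quick demo. To restrict a load table to well-sampled site-years, one option (a sketch; this sums raw daily flux without the interpolation step, so remaining gaps bias loads low):

NO3_N_flux %>% 
    mutate(year = strftime(datetime, format = '%Y')) %>% 
    semi_join(siteyears_keep, by = c('site_code', 'year')) %>% 
    group_by(site_code, year) %>% 
    summarize(load_kg_ha = sum(val, na.rm = TRUE), .groups = 'drop')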

(Un)scale flux by watershed area

(kg_ha_d <- slice(NO3_N_flux, 1:3))
  # A tibble: 3 × 7
    datetime            site_code var          val ms_status ms_interp   val_err
    <dttm>              <chr>     <chr>      <dbl>     <dbl>     <dbl>     <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.00113         0         0 0.0000621
  2 1985-05-10 00:00:00 ALBION    GN_NO3_N 0.00180         0         1 0.000107 
  3 1985-05-11 00:00:00 ALBION    GN_NO3_N 0.00101         0         1 0.0000654
(kg_d <- ms_undo_scale_flux_by_area(kg_ha_d))
  # A tibble: 3 × 7
    datetime            site_code var        val ms_status ms_interp   val_err
    <dttm>              <chr>     <chr>    <dbl>     <dbl>     <dbl>     <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.813         0         0 0.0000621
  2 1985-05-10 00:00:00 ALBION    GN_NO3_N 1.29          0         1 0.000107 
  3 1985-05-11 00:00:00 ALBION    GN_NO3_N 0.724         0         1 0.0000654
(kg_ha_d <- ms_scale_flux_by_area(kg_d))
  # A tibble: 3 × 7
    datetime            site_code var          val ms_status ms_interp   val_err
    <dttm>              <chr>     <chr>      <dbl>     <dbl>     <dbl>     <dbl>
  1 1985-05-09 00:00:00 ALBION    GN_NO3_N 0.00113         0         0 0.0000621
  2 1985-05-10 00:00:00 ALBION    GN_NO3_N 0.00180         0         1 0.000107 
  3 1985-05-11 00:00:00 ALBION    GN_NO3_N 0.00101         0         1 0.0000654
#NOTE: not every macrosheds function is set up to propagate error (val_err) yet

Part 3: Useful tools

Automated citation/acknowledgement for any subset of the MacroSheds dataset

?ms_generate_attribution

attrib <- ms_generate_attribution(doc, chem_source = 'stream')

ls.str(attrib)
  acknowledgements :  chr "Primary data were provided by the following sources:\n1. East River SFA DOE (Funded by the Department of Energy"| __truncated__
  bibliography :  chr [1:120] "@article{vlah_etal_macrosheds_2023,\n\tauthor = {Vlah, Michael J. and Rhea, Spencer and Bernhardt, Emily S. and"| __truncated__ ...
  full_details_timeseries : tibble [101 × 27] (S3: tbl_df/tbl/data.frame)
  full_details_ws_attr : tibble [28 × 8] (S3: tbl_df/tbl/data.frame)
  intellectual_rights_explanations :  chr [1:4] "A share-alike license means derivative works must propagate the original license terms. If may_disregard_with_p"| __truncated__ ...
  intellectual_rights_notifications : List of 4
   $ sharealike_license      : tibble [1 × 5] (S3: tbl_df/tbl/data.frame)
   $ notify_of_intent_M      : tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
   $ notify_on_distribution_M: tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
   $ provide_access_M        : tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
attrib$intellectual_rights_notifications
  $sharealike_license
  # A tibble: 1 × 5
    network domain macrosheds_prodname       may_disregard_with_permission contact
    <chr>   <chr>  <chr>                     <lgl>                         <chr>  
  1 <NA>    <NA>   tcw (watershed attribute) FALSE                         https:…
  
  $notify_of_intent_M
  # A tibble: 1 × 4
    network domain macrosheds_prodname contact       
    <chr>   <chr>  <chr>               <chr>         
  1 lter    plum   stream_chemistry    pie_im@mbl.edu
  
  $notify_on_distribution_M
  # A tibble: 1 × 4
    network domain macrosheds_prodname contact       
    <chr>   <chr>  <chr>               <chr>         
  1 lter    plum   stream_chemistry    pie_im@mbl.edu
  
  $provide_access_M
  # A tibble: 1 × 4
    network domain macrosheds_prodname contact       
    <chr>   <chr>  <chr>               <chr>         
  1 lter    plum   stream_chemistry    pie_im@mbl.edu
attrib$intellectual_rights_explanations
  [1] "A share-alike license means derivative works must propagate the original license terms. If may_disregard_with_permission is TRUE, you may ask the primary source contact for permission to use your own license."
  [2] "notify_of_intent_M means the primary source requires notice of any plans to publish derivative works that use their data."                                                                                       
  [3] "notify_on_distribution_M means the primary source requires that they be informed of any publications resulting from their data."                                                                                 
  [4] "provide_access_M means the primary source requires online access to any publications resulting from their data."
t(attrib$full_details_timeseries[1, ])
                                 [,1]                                                                                                                                                                                           
  domain                         "east_river"                                                                                                                                                                                   
  network                        "doe"                                                                                                                                                                                          
  macrosheds_prodcode            "VERSIONLESS008"                                                                                                                                                                               
  macrosheds_prodname            "ws_boundary"                                                                                                                                                                                  
  doi                            "10.21952/WTR/1508403"                                                                                                                                                                         
  data_status                    NA                                                                                                                                                                                             
  license                        "https://creativecommons.org/licenses/by/4.0/"                                                                                                                                                 
  license_type                   "Attribution"                                                                                                                                                                                  
  license_sharealike             NA                                                                                                                                                                                             
  IR_acknowledgement_text        NA                                                                                                                                                                                             
  IR_acknowledge_domain          NA                                                                                                                                                                                             
  IR_acknowledge_funding_sources NA                                                                                                                                                                                             
  IR_acknowledge_grant_numbers   NA                                                                                                                                                                                             
  IR_notify_of_intentions        NA                                                                                                                                                                                             
  IR_notify_on_distribution      NA                                                                                                                                                                                             
  IR_provide_online_access       NA                                                                                                                                                                                             
  IR_provide_two_reprints        NA                                                                                                                                                                                             
  IR_collaboration_consultation  NA                                                                                                                                                                                             
  IR_questions                   NA                                                                                                                                                                                             
  IR_needs_clarification         NA                                                                                                                                                                                             
  contact                        "rosemary.carroll@dri.edu"                                                                                                                                                                     
  contact_name1                  "Rosemary Carroll"                                                                                                                                                                             
  creator_name1                  "Rosemary Carroll"                                                                                                                                                                             
  funding                        "Funded by the Department of Energy"                                                                                                                                                           
  citation                       "Carroll, R., Bill, M., Dong, W., & Williams, K. (2019). Sub-Basin Delineation for the Upper East River, Colorado, United States. Watershed Function SFA. https://doi.org/10.21952/WTR/1508403"
  link                           "https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/packages/application%2Fbagit-097/ess-dive-3b9a58adc163fd4-20190509T155206477018"                                                               
  link_download_datetime         "2022-05-16 18:26:32 UTC"
attrib <- ms_generate_attribution(doc, chem_source = 'stream', write_to_dir = '~')
  Warning in dir.create(write_to_dir):
  '/home/mike/macrosheds_attribution_information' already exists
  Output files written to ~/macrosheds_attribution_information/
read_file('~/macrosheds_attribution_information/ms_bibliography.bib') %>% 
    substr(1, 1092) %>% 
    cat()
  @article{vlah_etal_macrosheds_2023,
    author = {Vlah, Michael J. and Rhea, Spencer and Bernhardt, Emily S. and Slaughter, Weston and Gubbins, Nick and DelVecchia, Amanda G. and Thellman, Audrey and Ross, Matthew R. V.},
    title = {MacroSheds: A synthesis of long-term biogeochemical, hydroclimatic, and geospatial data from small watershed ecosystem studies},
    journal = {Limnology and Oceanography Letters},
    volume = {8},
    number = {3},
    pages = {419-452},
    doi = {https://doi.org/10.1002/lol2.10325},
    url = {https://aslopubs.onlinelibrary.wiley.com/doi/abs/10.1002/lol2.10325},
    year = {2023},
  }
  
  @article{carroll_sub-basin_2019,
    title = {Sub-{Basin} {Delineation} for the {Upper} {East} {River}, {Colorado}, {United} {States}},
    doi = {10.21952/WTR/1508403},
    journal = {Watershed Function SFA},
    author = {Carroll, R. and Bill, M. and Dong, W. and Williams, K.},
    year = {2019},
  }
  
  @misc{usda_forest_service_southern_region_francis_2011,
    title = {Francis {Marion} \& {Sumter} {National} {Forests}: {SEF}\_Boundary},
    author = {{USDA Forest Service, Southern Region}},
    year = {2011},
  }

Watershed delineation

(interactive, so output not included in the HTML version of this document)

This can work where StreamStats fails, especially in small watersheds.

# whitebox::install_whitebox() #if this fails, use the next line
# whitebox::install_whitebox(platform = 'linux_musl')

tmp <- tempdir()

out <- ms_delineate_watershed(
    lat = 44.21013,
    long = -122.2571,
    crs = 4326,
    write_dir = tmp,
    write_name = 'example_site'
)

select <- dplyr::select #resolve namespace conflict introduced by raster package

str(out) #specifications of your successful delineation

Load spatial data

(watershed boundaries, precip gauge locations, stream gauge locations)

suppressPackageStartupMessages(library(mapview))

ws_bound <- ms_load_spatial_product(project_root, 'ws_boundary', domain = 'boulder')
prcp_gauges <- ms_load_spatial_product(project_root, 'precip_gauge_locations', domain = 'boulder')
strm_gauges <- ms_load_spatial_product(project_root, 'stream_gauge_locations', domain = 'boulder')

mapview(ws_bound) + mapview(prcp_gauges) + mapview(strm_gauges, col.regions = rainbow(n = 3))

More vignettes

Vignettes will only load if you installed macrosheds with build_vignettes = TRUE, but they’re also hosted as markdown files in the macrosheds GitHub repository. These provide tutorials on data retrieval, flux calculation, watershed delineation, and precip interpolation.

vignette(package = 'macrosheds')
vignette('ms_watershed_delineation', package = 'macrosheds')
?ms_synchronize_timestep  # upsample or downsample macrosheds data
?ms_calc_watershed_precip # spatial interpolation of precip gauge data
?ms_separate_baseflow     # baseflow/stormflow separation via hydrostats

Part 4: Linking to the CAMELS datasets

Download CAMELS-US

(just watershed descriptors and streamflow time series)

camels_dir <- file.path(project_root, 'camels')
dir.create(camels_dir, showWarnings = FALSE)

attr_categories <- c('clim', 'geol', 'soil', 'topo', 'vege', 'hydro')

#watershed attributes
for(x in attr_categories){
    download.file(paste0('https://gdex.ucar.edu/dataset/camels/file/camels_', x, '.txt'),
                  destfile = paste0(camels_dir, '/', x, '.txt'))
}

# #Daymet forcings
# download.file('https://gdex.ucar.edu/dataset/camels/file/basin_timeseries_v1p2_modelOutput_daymet.zip',
#               destfile = file.path(camels_dir, 'daymet.zip'))

#observed flow
# download.file('https://gdex.ucar.edu/dataset/camels/file/basin_timeseries_v1p2_metForcing_obsFlow.zip',
#               destfile = file.path(camels_dir, 'flow.zip'))
# unzip(file.path(camels_dir, 'flow.zip')) #untested. be aware of the truncation issue described in ?unzip

Visualize CAMELS

view_attr_dists <- function(path_to_camels){

    char_cols <- c('dom_land_cover', 'geol_1st_class', 'geol_2nd_class',
                   'high_prec_timing', 'low_prec_timing')
    
    vege <- read_delim(file.path(path_to_camels, 'vege.txt'), delim = ';')
    topo <- read_delim(file.path(path_to_camels, 'topo.txt'), delim = ';')
    geol <- read_delim(file.path(path_to_camels, 'geol.txt'), delim = ';')
    soil <- read_delim(file.path(path_to_camels, 'soil.txt'), delim = ';')
    clim <- read_delim(file.path(path_to_camels, 'clim.txt'), delim = ';')
    hydro <- read_delim(file.path(path_to_camels, 'hydro.txt'), delim = ';')

    categories <- c(rep('vege', ncol(vege) - 1),
                    rep('topo', ncol(topo) - 1),
                    rep('geol', ncol(geol) - 1),
                    rep('soil', ncol(soil) - 1),
                    rep('clim', ncol(clim) - 1),
                    rep('hydr', ncol(hydro) - 1))
    
    d <- reduce(list(vege, topo, geol, soil, clim, hydro), full_join, by = 'gauge_id') %>% 
        # select(-matches(!!char_cols)) %>% 
        mutate(across(-c(gauge_id, !!char_cols), as.numeric))
        
    category_colors <- c('vege' = 'forestgreen',
                         'topo' = 'black',
                         'geol' = 'gray50',
                         'soil' = 'orange4',
                         'clim' = 'darkblue',
                         'hydr' = 'turquoise3')
    
    nrows <- ceiling(sqrt(length(categories)))
    ncols <- ceiling((length(categories) + 1) / nrows) #+1 panel for the legend
    par(mfrow = c(nrows, ncols),
        mar = c(1, 2, 0, 0), oma = rep(0.5, 4))
    
    plot(1, 1, type = 'n', axes = F)
    legend('left', legend = names(category_colors), col = unname(category_colors),
           lty = 1, lwd = 3, bty = 'n')
    legend('right', legend = c('CAMELS', 'MacroSheds'),
           fill = c('gray70', 'darkviolet'), bty = 'n', border = NA)
    
    for(i in 2:ncol(d)){
        
        varname <- colnames(d)[i]
        if(! varname %in% char_cols){
            
            dvar <- d %>% 
                select(gauge_id, all_of(varname)) %>% 
                filter(! is.na(.data[[varname]])) %>% #.data[[varname]] targets the column, not the string
                arrange(.data[[varname]])
            
            source_color <- if_else(grepl('^[0-9]{6,}$', dvar$gauge_id), 'gray70', 'darkviolet')
            color <- category_colors[categories[i - 1]]
            # color <- category_colors[names(category_colors) == categories[i - 1]]
            
            barplot(dvar[[varname]], col = source_color, border = source_color)
            box(col = color, lwd = 4)
            mtext(varname, 3, line = -1.5, col = 'black')
        }
    }
}

view_q_dists <- function(path_to_camels){
    
    flowdir <- file.path(path_to_camels, 'basin_dataset_public_v1p2/usgs_streamflow')
    flowfiles <- list.files(flowdir, recursive = TRUE, pattern = 'streamflow_qc\\.txt$',
                            full.names = TRUE)
    
    par(mfrow = c(1, 1), mar = c(2, 4, 1, 1))
    plot(as.Date(c('1930-01-01', '2023-07-30')), log(c(0.001, 1e6)), type = 'n',
         xlab = '', ylab = 'Log discharge (cfs)', yaxt = 'n')
    yaxis <- c(0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 1e5, 1e6)
    axis(2, log(yaxis), labels = as.character(yaxis))
    
    for(f in flowfiles){
        
        flow <- read.fwf(
            f,
            widths = c(9, 4, 3, 3, 9, 2),
            header = FALSE,
            strip.white = TRUE,
            colClasses = 'character',
            col.names = c('gauge_id', 'year', 'month', 'day', 'Q_cfs', 'flag')
        ) %>%
            as_tibble() %>%
            mutate(date = as.Date(paste(year, month, day), format = '%Y %m %d'),
                   Q_cfs = as.numeric(Q_cfs)) %>%
            mutate(Q_cfs = if_else(Q_cfs == -999, NA_real_, Q_cfs), #-999 is the missing-data sentinel
                   Q_cfs = log(Q_cfs)) #log only after replacing sentinels
        
        lines(flow$date, flow$Q_cfs, col = adjustcolor('black', alpha.f = 0.3))
        
                   #      mm/d        cfs         m^3/ft^3      mm/m   s/d     m^2
                   # discharge = discharge * 0.028316846 * 1000 * 86400 / ws_area_m2)
    }
}
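
The dormant comment in that loop sketches the conversion from cfs to area-normalized runoff. As a standalone helper (an illustration; the watershed area would come from CAMELS' topo attributes, converted to m^2):

cfs_to_mm_d <- function(q_cfs, ws_area_m2){
    #cfs * (m^3/ft^3) = m^3/s; * (s/d) = m^3/d; / m^2 = m/d; * (mm/m) = mm/d
    q_cfs * 0.028316846 * 86400 / ws_area_m2 * 1000
}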

view_attr_dists(camels_dir)

Merge MacroSheds and CAMELS

ms_download_ws_attr(project_root, dataset = 'CAMELS summaries')
  Downloading dataset type: CAMELS summaries (1/1; download id 39944257)
  watershed_summaries_CAMELS.feather successfully downloaded to ~/macrosheds_workshop
ms_cmls <- ms_load_product(project_root, prodname = 'ws_attr_CAMELS_summaries') %>% 
    rename(geol_porostiy = geol_porosity, #accommodate typo in CAMELS dataset
           q5 = Q5, q95 = Q95, baseflow_index = baseflow_index_landson) #and some slight name differences

#proportion of non-missing values per column
apply(ms_cmls, MARGIN = 2, function(x) round(sum(! is.na(x)) / length(x), 2))
             site_code               domain              network 
                  1.00                 1.00                 1.00 
                p_mean             pet_mean              aridity 
                  0.93                 0.93                 0.93 
         p_seasonality            frac_snow       high_prec_freq 
                  0.93                 0.93                 0.93 
         high_prec_dur     high_prec_timing        low_prec_freq 
                  0.93                 0.93                 0.93 
          low_prec_dur      low_prec_timing       geol_1st_class 
                  0.93                 0.93                 0.00 
   glim_1st_class_frac       geol_2nd_class  glim_2nd_class_frac 
                  0.97                 0.00                 0.97 
  carbonate_rocks_frac        geol_porostiy    geol_permeability 
                  0.97                 0.91                 0.91 
             sand_frac            silt_frac            clay_frac 
                  0.76                 0.76                 0.76 
          organic_frac            gauge_lat            gauge_lon 
                  0.76                 1.00                 1.00 
                  area            elev_mean           slope_mean 
                  0.99                 0.99                 0.99 
           frac_forest  dom_land_cover_frac       dom_land_cover 
                  0.89                 0.89                 0.89 
         root_depth_50        root_depth_99               q_mean 
                  0.89                 0.89                 0.93 
          runoff_ratio          stream_elas            slope_fdc 
                  0.93                 0.93                 0.87 
        baseflow_index             hfd_mean                   q5 
                  0.93                 0.93                 0.93 
                   q95          high_q_freq           high_q_dur 
                  0.93                 0.93                 0.93 
            low_q_freq            low_q_dur          zero_q_freq 
                  0.93                 0.93                 0.93
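
The same summary in dplyr, if you prefer (equivalent to the apply() call above):

ms_cmls %>% 
    summarize(across(everything(), ~round(mean(! is.na(.x)), 2)))
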
camels_ms_dir <- file.path(project_root, 'camels_macrosheds_merge')
dir.create(camels_ms_dir, showWarnings = FALSE)

cmls_topo <- read_delim(file.path(camels_dir, 'topo.txt'), delim = ';') %>% 
    rename(area = area_gages2) %>% #could also use area_geospa_fabric here
    mutate(across(everything(), as.character))
    
ms_cmls %>% 
    select(gauge_id = site_code, matches(colnames(cmls_topo)), 'area') %>%
    mutate(across(everything(), as.character)) %>% 
    bind_rows(cmls_topo) %>% 
    write_delim(file.path(camels_ms_dir, 'topo.txt'), delim = ';')

camels_ms_merge <- function(dataset, read_dir, write_dir){
    
    #dataset: character; one of "vege", "topo", "soil", "geol", "hydro", "clim"
    
    cmls_d <- read_delim(paste0(read_dir, '/', dataset, '.txt'), delim = ';') %>% 
        mutate(across(everything(), as.character))
    
    missing_vars <- setdiff(colnames(cmls_d), colnames(ms_cmls)) %>% 
        str_subset('gauge_id', negate = TRUE)
    
    if(length(missing_vars)){
        print(paste('MacroSheds is missing some CAMELS variables:',
                    paste(missing_vars, collapse = ', ')))
    }
    
    ms_cmls %>% 
        select(gauge_id = site_code, matches(colnames(cmls_d))) %>%
        mutate(across(everything(), as.character)) %>% 
        bind_rows(cmls_d) %>% 
        write_delim(paste0(write_dir, '/', dataset, '.txt'), delim = ';')
}

camels_ms_merge('vege', read_dir = camels_dir, write_dir = camels_ms_dir)
  [1] "MacroSheds is missing some CAMELS variables: lai_max, lai_diff, gvf_max, gvf_diff"
camels_ms_merge('geol', read_dir = camels_dir, write_dir = camels_ms_dir)
camels_ms_merge('soil', read_dir = camels_dir, write_dir = camels_ms_dir)
  [1] "MacroSheds is missing some CAMELS variables: soil_depth_pelletier, soil_depth_statsgo, soil_porosity, soil_conductivity, max_water_content, water_frac, other_frac"
camels_ms_merge('clim', read_dir = camels_dir, write_dir = camels_ms_dir)
camels_ms_merge('hydro', read_dir = camels_dir, write_dir = camels_ms_dir)

Visualize again

view_attr_dists(camels_ms_dir)