Before the workshop

Please install the following packages:

#macrosheds
devtools::install_github('https://github.com/MacroSHEDS/macrosheds.git')

#tidyverse (dplyr, stringr, readr, etc.)
install.packages('tidyverse')

Then download the MacroSheds data we will use:

library(macrosheds)

# this is where files will be downloaded. feel free to change it.
project_root <- '~/macrosheds_workshop'

# datasets we will use
ms_download_ws_attr(project_root, dataset = 'summaries')
ms_download_core_data(project_root, domains = c('niwot', 'plum', 'east_river', 'santee'))

These packages are optional (each is used only here and there; the overall sequence doesn’t depend on them):

install.packages('xts')         # time series representation
install.packages('dygraphs')    # interactive plots
install.packages('viridis')     # colorblind-friendly color palettes
install.packages('factoextra')  # ordination, etc.
install.packages('mapview')     # interactive maps
install.packages('data.table')  # efficient computation
install.packages('whitebox')    # geospatial processing
whitebox::install_whitebox() #additional whitebox binaries; if this fails, use the next line
# whitebox::install_whitebox(platform = 'linux_musl')

MacroSheds

long-term watershed ecosystem studies, harmonized

Goals

  • enable calculation of input/output solute fluxes from diverse watersheds
  • lower barriers to accessing diverse watershed data through data harmonization and visualization
  • investigate:
    • variation in magnitude, timing, form of N exports
    • broad effects of watershed acidification
    • variation in mineral weathering
    • watershed sensitivity to climate change

Terms

  • site: an individual gauging station or stream sampling location and its watershed
  • domain: one or more sites under common management
  • network: one or more domains under common funding/leadership
  • flux: solute mass normalized to watershed area, over time (kg/ha/d); the sketch after this list shows the unit arithmetic
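
For intuition, here is how those units compose: concentration (mg/L) times discharge (L/s) gives mass per time, which converts to kg/d and then normalizes by watershed area. A toy calculation (the numbers are invented, not MacroSheds data):

conc_mg_L  <- 2.5   #solute concentration (mg/L)
q_L_s      <- 40    #mean daily discharge (L/s)
ws_area_ha <- 120   #watershed area (ha)

#mg/s -> kg/d -> kg/ha/d
flux_kg_ha_d <- conc_mg_L * q_L_s * 86400 / 1e6 / ws_area_ha
flux_kg_ha_d
  [1] 0.072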

Part 1: Setup and overview

suppressPackageStartupMessages(library(tidyverse))
library(macrosheds)
  
  macrosheds v1.0.2: Up to date ✓
  
  This package is licensed under MIT, but licensing of MacroSheds data is more complex. See 
  https://docs.google.com/document/d/1CPaQ705QyoWfu6WjA4xHgQooCQ8arq3NZ8DO9tb9jZ0/edit?usp=sharing
  
  Complete metadata: https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=1262
options(readr.show_progress = FALSE,   #prevent gratuitous printing from read_csv
        readr.show_col_types = FALSE,  #same
        timeout = 600)                 #allow more time for dataset downloads

project_root <- '~/macrosheds_workshop'

Retrieve basic site data and watershed summary attributes

ms_sites <- ms_load_sites() %>% 
    filter(site_type == 'stream_gauge') #some precip gauges have the same names as stream gauges!

?ms_load_variables
ms_ts_variables <- ms_load_variables(var_set = 'timeseries')

?ms_download_ws_attr
# ms_download_ws_attr(project_root, dataset = 'summaries')

?ms_load_product
ws_smry <- ms_load_product(project_root, prodname = 'ws_attr_summaries')

Visualize distributions of MacroSheds watershed summary attributes
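
A quick way to get oriented is to facet histograms over a handful of attributes (a minimal sketch, assuming the ws_smry object loaded above; the 12-column subset is arbitrary):

ws_smry %>% 
    select(where(is.numeric)) %>% 
    select(1:12) %>%   #first 12 numeric attributes, just to keep the panel readable
    pivot_longer(everything(), names_to = 'attribute', values_to = 'value') %>% 
    ggplot(aes(value)) +
    geom_histogram(bins = 30) +
    facet_wrap(~attribute, scales = 'free')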

Principal component analysis of watershed summary attributes

suppressPackageStartupMessages(library(factoextra))

pca_data <- ws_smry %>% 
    select(where( ~sum(is.na(.)) / length(.) < 0.3)) %>% #drop columns with >= 30% missing values
    drop_na()                                            #drop rows with any missing values

domains <- pca_data$domain

pca_data <- pca_data %>% 
    select(-site_code, -domain, -network) %>% 
    select(where(~sd(., na.rm = TRUE) != 0)) %>%         #drop columns with no variance
    as.matrix()

smry_categories <- substr(colnames(pca_data), 1, 1)
category_map <- c('c' = 'climate', 'h' = 'hydrology', 'l' = 'landcover',
                  'p' = 'parentmat', 't' = 'terrain', 'v' = 'vegetation',
                  'w' = 'ws area')
smry_categories <- factor(category_map[match(smry_categories, names(category_map))])

pca <- prcomp(pca_data, center = TRUE, scale. = TRUE)

fviz_eig(pca)

fviz_pca_biplot(pca, geom.var = 'arrow', geom.ind = 'point', title = '',
                col.var = smry_categories, palette = 'Dark2')

fviz_pca_biplot(pca, geom.var = '', geom.ind = 'point', title = '',
                col.ind = as.factor(domains))
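
To see which attributes drive the leading components, factoextra can also rank variable contributions (top = 10 is an arbitrary cutoff):

fviz_contrib(pca, choice = 'var', axes = 1, top = 10)  #top contributors to PC1
fviz_contrib(pca, choice = 'var', axes = 2, top = 10)  #...and to PC2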

Identify domains at elevational extremes (2 highest, 2 lowest)

elev_extreme_domains <- ws_smry %>% 
    filter(! is.na(te_elev_mean)) %>% #no watersheds delineated for McMurdo LTER
    group_by(domain) %>% 
    summarize(domain_mean_elev = mean(te_elev_mean)) %>% 
    ungroup() %>% 
    arrange(desc(domain_mean_elev)) %>% 
    slice(c(1:2, (n() - 1):n())) %>% #2 highest and 2 lowest domains, by average watershed elevation
    print() %>%
    pull(domain)
  # A tibble: 4 × 2
    domain     domain_mean_elev
    <chr>                 <dbl>
  1 niwot               3593.  
  2 east_river          3344.  
  3 plum                  31.7 
  4 santee                 9.10
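
The slice(c(1:2, (n() - 1):n())) call depends on the preceding arrange(); dplyr's slice_max() and slice_min() express the same selection without pre-sorting, e.g.:

domain_elev <- ws_smry %>% 
    filter(! is.na(te_elev_mean)) %>% 
    group_by(domain) %>% 
    summarize(domain_mean_elev = mean(te_elev_mean))

bind_rows(slice_max(domain_elev, domain_mean_elev, n = 2),   #2 highest
          slice_min(domain_elev, domain_mean_elev, n = 2))   #2 lowest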

Load site-variable catalog; filter by domain

sitevars <- ms_load_variables('timeseries_by_site')

sitevars <- filter(sitevars,
                   domain %in% elev_extreme_domains,
                   chem_category == 'stream_conc')

length(unique(sitevars$site_code)) # n sites
  [1] 43

What variables are shared by at least one site from each of these 4 domains?

get_shared_domain_vars <- function(df){

    df %>% 
        group_split(domain) %>% 
        map(~ pluck(.x, 'variable_code')) %>% 
        reduce(intersect)
}

get_shared_domain_vars(sitevars)
  [1] "Cl"    "DOC"   "PO4_P" "SO4_S" "TDN"

Again, but keep only variables recorded for >= 2 years, at an average rate of >= 36 obs/year

sitevars <- sitevars %>% 
    mutate(ndays = difftime(last_record_utc, first_record_utc, units = 'days')) %>% 
    filter(ndays >= 2 * 365.25,
           mean_obs_per_day * 365.25 >= 36)

get_shared_domain_vars(sitevars)
  [1] "Cl"    "DOC"   "PO4_P" "SO4_S" "TDN"

Retrieve core time-series data

?ms_download_core_data
# ms_download_core_data(project_root, domains = elev_extreme_domains)

?ms_load_product
(doc <- ms_load_product(
    project_root,
    prodname = 'stream_chemistry',
    filter_vars = 'DOC',
    domains = elev_extreme_domains,
    warn = FALSE
))
  # A tibble: 162,001 × 7
     datetime            site_code var      val ms_status ms_interp val_err
     <dttm>              <chr>     <chr>  <dbl>     <dbl>     <dbl>   <dbl>
   1 2015-11-09 00:00:00 Avery     GN_DOC 0.38          0         0  0.0001
   2 2015-11-10 00:00:00 Avery     GN_DOC 0.374         0         1  0.0001
   3 2015-11-11 00:00:00 Avery     GN_DOC 0.367         0         1  0.0001
   4 2015-11-12 00:00:00 Avery     GN_DOC 0.361         0         1  0.0001
   5 2015-11-13 00:00:00 Avery     GN_DOC 0.354         0         1  0.0001
   6 2015-11-14 00:00:00 Avery     GN_DOC 0.348         0         1  0.0001
   7 2015-11-15 00:00:00 Avery     GN_DOC 0.341         0         1  0.0001
   8 2015-11-16 00:00:00 Avery     GN_DOC 0.335         0         1  0.0001
   9 2015-11-17 00:00:00 Avery     GN_DOC 0.329         0         1  0.0001
  10 2015-11-18 00:00:00 Avery     GN_DOC 0.322         0         1  0.0001
  # ℹ 161,991 more rows
unique(doc$var)
  [1] "GN_DOC"
table(doc$ms_status)
  
       0      1 
  161930     71
table(doc$ms_interp)
  
       0      1 
  126140  35861
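
ms_status flags questionable values (1) and ms_interp flags values imputed by the MacroSheds pipeline (1); see the data documentation for full definitions. To keep only clean, non-interpolated observations:

doc_clean <- filter(doc, ms_status == 0, ms_interp == 0)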

Visualize stream discharge

library(viridis)
  Loading required package: viridisLite
plot_q <- function(ms_root, domain){

    q <- ms_load_product(ms_root, prodname = 'discharge', domains = domain)
    
    current_year <- as.numeric(strftime(Sys.Date(), format = '%Y'))
    earliest_year <- as.numeric(strftime(min(q$datetime), format = '%Y'))
    nyears <- current_year - earliest_year
    yrcols <- viridis(n = nyears)

    sites <- unique(q$site_code)

    plotrc <- ceiling(sqrt(length(sites)))
    doyseq <- seq(1, 366, 30)
    par(mfrow = c(plotrc, plotrc), mar = c(1, 2, 0, 0), oma = c(0, 0, 2, 0))

    for(s in sites){

        plot(NA, NA, xlim = c(1, 366), ylim = c(0, nyears), xaxs = 'i', yaxs = 'i',
             ylab = '', xlab = '', yaxt = 'n', cex.axis = 0.6, xaxt = 'n', xpd = NA)
        axis(1, doyseq, doyseq, tick = FALSE, line = -2, cex.axis = 0.8)
        axis(2, 1:nyears, earliest_year:(current_year - 1), las = 2, cex.axis = 0.6,
             hadj = 0.7)

        qsub <- q %>%
            filter(site_code == s) %>%  #q already contains only the requested domain
            mutate(doy = as.numeric(strftime(datetime, format = '%j', tz = 'UTC')),
                   yr_offset = lubridate::year(datetime) - earliest_year)

        yrs <- unique(qsub$yr_offset)

        for(i in seq_along(yrs)){
            qss <- qsub %>%
                filter(yr_offset == yrs[i]) %>%
                arrange(doy)
            #scale() standardizes each year's values; adding yr_offset stacks years vertically
            lines(qss$doy, c(scale(qss$val)) + qss$yr_offset, col = yrcols[yrs[i] + 1])
        }

        mtext(s, 3, outer = FALSE, line = -2)
    }

    mtext(paste0(domain, ' (DOY vs. Year)'), 3, outer = TRUE)
}

#note: these domains are not among the four downloaded in the setup step, so retrieve them first
ms_download_core_data(project_root, domains = c('boulder', 'baltimore'))

plot_q(project_root, 'boulder')

# plot_q(project_root, 'fernow')
plot_q(project_root, 'baltimore')
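
For closer inspection of a single record, the optional xts and dygraphs packages from the setup list pair nicely (a sketch; the first niwot site is chosen arbitrarily, and discharge is assumed to be in L/s):

library(xts)
library(dygraphs)

q_nw <- ms_load_product(project_root, prodname = 'discharge', domains = 'niwot')
nw_site1 <- filter(q_nw, site_code == unique(site_code)[1])

dygraph(xts(nw_site1$val, order.by = nw_site1$datetime), ylab = 'Q (L/s)') %>% 
    dyRangeSelector()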