Match Bathymetry and Raster Grids
Read in Data
Bathymetry
bathy_og<- raster("Data/raw/Multibeam/ElbowDec2015Cube_2m.tiff") #Original bathymetry
class : RasterLayer
dimensions : 13948, 4908, 68456784 (nrow, ncol, ncell)
resolution : 2, 2 (x, y)
extent : 184571, 194387, 3064261, 3092157 (xmin, xmax, ymin, ymax)
crs : +proj=utm +zone=17 +datum=WGS84 +units=m +no_defs
source : C:/Users/socce/Documents/Grad_School/Research/R_Projects/Elbow_Paper/Data/raw/Multibeam/ElbowDec2015Cube_2m.tiff
names : ElbowDec2015Cube_2m
values : -77.40717, -44.16052 (min, max)
Backscatter
backscatter_og<- raster("Data/raw/Multibeam/EL1_1mTimeSeriesBS_TrimmedtoMosaic_AVG800.tiff")
class : RasterLayer
dimensions : 26788, 8382, 224537016 (nrow, ncol, ncell)
resolution : 1, 1 (x, y)
extent : 185107.5, 193489.5, 3064871, 3091659 (xmin, xmax, ymin, ymax)
crs : +proj=utm +zone=17 +datum=WGS84 +units=m +no_defs
source : C:/Users/socce/Documents/Grad_School/Research/R_Projects/Elbow_Paper/Data/raw/Multibeam/EL1_1mTimeSeriesBS_TrimmedtoMosaic_AVG800.tiff
names : EL1_1mTimeSeriesBS_TrimmedtoMosaic_AVG800
values : -13.84911, -5.748498 (min, max)
EL_shp<- readOGR("Data/derived/shapefiles/EL_Shp.shp")
[1] "SpatialPolygonsDataFrame"
attr(,"package")
[1] "sp"
[1] "Extent: 777033 783941 3065379 3090118 (xmin, xmax, ymin, ymax)"
[1] "crs: +proj=utm +zone=16 +datum=WGS84 +units=m +no_defs"
Resample to macthing 10m grid
bathy<- projectRaster(from = bathy_og, res = 10, crs = make_crs("UTM 16N")) #Reproject/resample
bathy<- focal(bathy, w=matrix(data = 1, nrow = 3, ncol = 3), fun=mean, na.rm=TRUE, NAonly=TRUE) #Fill holes
names(bathy)<- "bathy"
backscatter<- projectRaster(from= backscatter_og, to =bathy) #Reproject/resample
backscatter<- focal(backscatter, w=matrix(data = 1, nrow = 3, ncol = 3), fun=mean, na.rm=TRUE, NAonly=TRUE) #Fill holes
names(backscatter)<- "backscatter"
Trim data to mask
bathy<- mask(x = bathy, EL_shp)
backscatter<- mask(x = backscatter, EL_shp)
Finalized Surfaces
Figure S1.2

Calculate Derivative Features
Determine scales of analysis
The Fibonacci sequence is a good way to systematically cover multiple-scales (Wilson, 2007).
my_window_sizes<- 1+ (2*fib_seq(10)[-c(1,2)])
print(my_window_sizes)
[1] 3 5 7 11 17 27 43 69
We will evaluate predictors at 8 different window sizes from 3x3 to 69x69
Bathymetric Derivative Features
Calculate bathymetric derivatives at various scales of analysis. This includes the slope, aspect (split into Northness and Eastness), Topographic Position Index (Whether an area is a local high or low), mean bathymetry, and standard deviation of bathymetry (a measure of rugosity).
slope<- terrain(bathy, opt= "slope", unit = "degrees")
names(slope)<- "slope_native"
aspect<- terrain(bathy, opt = c("aspect"), unit = "radians")
eastness<- sin(aspect)
names(eastness)<- "eastness_native"
northness<- cos(aspect)
names(northness)<- "northness_native"
for (i in my_window_sizes) {
print(i)
w<- matrix(data = 1, nrow = i, ncol = i)
curr_slope<- focal(x = slope$slope_native, w, fun=mean, na.rm=TRUE, pad=TRUE)
names(curr_slope)<- paste0("slope_",as.character(i),"x", as.character(i))
slope<- addLayer(slope, curr_slope)
curr_eastness<- focal(x = eastness$eastness_native, w, fun=mean, na.rm=TRUE, pad=TRUE)
names(curr_eastness)<- paste0("eastness_",as.character(i),"x", as.character(i))
eastness<- addLayer(eastness, curr_eastness)
curr_northness<- focal(x = northness$northness_native, w, fun=mean, na.rm=TRUE, pad=TRUE)
names(curr_northness)<- paste0("northness_",as.character(i),"x", as.character(i))
northness<- addLayer(northness, curr_northness)
curr_tpi<- TPI(bathy, d=c(i,i), na.rm=TRUE, pad = TRUE)
names(curr_tpi)<- paste0("tpi_",as.character(i),"x", as.character(i))
curr_BathyMean<- focal(x = bathy, w, fun=mean, na.rm=TRUE, pad=TRUE)
names(curr_BathyMean)<- paste0("BathyMean_",as.character(i),"x", as.character(i))
curr_BathySD<- focal(x = bathy, w, fun=sd, na.rm=TRUE, pad=TRUE)
names(curr_BathySD)<- paste0("BathySD_",as.character(i),"x", as.character(i))
if(i==3){
tpi<- curr_tpi
BathyMean<- curr_BathyMean
BathySD<- curr_BathySD
} else{
tpi<- addLayer(tpi, curr_tpi)
BathyMean<- addLayer(BathyMean, curr_BathyMean)
BathySD<- addLayer(BathySD, curr_BathySD)
}
}
bathy_deriv<- stack(BathyMean, BathySD, tpi, slope, eastness, northness)
bathy_deriv<- mask(x = bathy_deriv, EL_shp) #Trim back down b/c used pad and na.rm
Backscatter Derivative Features
Calculate backscatter derivatives at various scales of analysis. This includes the mean and standard deviation of bathymetry as well as texture metric derived from a Gray Level Co-occurence Matrix (GLCM).
The most common texture metrics from remote sensing were derived using the formulas from (Hall-Beyer 2017). Rotationally Invariant/Directionally Isotropic Harilick Texture Metrics were derived by calculating a symetrical GLCM with 32 gray levels. The texture metrics can be broken down into three groups: The contrast group (contrast, dissimilarity, and homogeneity), the orderliness group (ASM and Entropy), and the descriptive statistics group (mean, variance, and correlation;Hall-Beyer, 2017). At smaller window sizes, the correlation texture can sometimes be undefined (0/0) leading to holes in the resulting raster surfaces, so correlation at 3x3, 5x5, and 7x7 were removed from the analysis to prevent this.
backscatter_quant<- quantize_raster(backscatter, n_levels = 32,method = "equal prob")
for (i in my_window_sizes) {
print(i)
w<- matrix(data = 1, nrow = i, ncol = i)
curr_bs_mean<- focal(backscatter, w, fun=mean, na.rm=TRUE, pad=TRUE)
names(curr_bs_mean)<- "BackscatterMean"
curr_bs_sd<- focal(backscatter, w, fun=sd, na.rm=TRUE, pad=TRUE)
names(curr_bs_sd)<- "BackscatterSD"
curr_BSderiv<- stack(curr_bs_mean,
curr_bs_sd,
glcm_textures(backscatter_quant, w= c(i,i), n_levels=32, shift = list(c(1, 0), c(1, 1), c(0, 1), c(-1, 1)), metrics = c("glcm_contrast", "glcm_dissimilarity", "glcm_homogeneity", "glcm_ASM",
"glcm_entropy", "glcm_mean", "glcm_variance", "glcm_correlation"), quantization= "none", na_opt = "center", pad = TRUE))
names(curr_BSderiv)<- paste0(names(curr_BSderiv), "_", as.character(i), "x", as.character(i))
if(i==3){
backscatter_deriv<- curr_BSderiv} else{
backscatter_deriv<- stack(backscatter_deriv, curr_BSderiv)
rm(curr_BSderiv)
}
}
backscatter_deriv<- mask(backscatter_deriv, EL_shp)
backscatter_deriv<- dropLayer(backscatter_deriv, which(names(backscatter_deriv)=="glcm_correlation_3x3"))
backscatter_deriv<- dropLayer(backscatter_deriv, which(names(backscatter_deriv)=="glcm_correlation_5x5"))
backscatter_deriv<- dropLayer(backscatter_deriv, which(names(backscatter_deriv)=="glcm_correlation_7x7")) #remove these b/c have NA's that create holes due to zero in denominator
Georeference Ground-Truth Data
Transects
Solid lines represent transects for training data, and dashed line represents the transect where observations were withheld in order to test the accuracy of substrate predictions
Figure S1.3

Read in Video Habitat Observations
Read in Timestamps for Camera Images
cam<- read_csv("Data/derived/CBASS/camera.csv")
Read in Position and Ancillary Data Streams
Hypack<- read_hypack(list.files("Data/raw/Hypack/", pattern="\\.RAW$", full.names = TRUE))
cable_out<- read_tsv("Data/raw/payout/payout_Feb&Oct2016.tsv", col_types = list(.default=col_double(), timestamp=col_datetime()))
Lat_Long<- get_HypackPos(Hypack)
names(Lat_Long)[2:3]<- c("ShipLon", "ShipLat")
Ship_Speed<- get_HypackSpeed(Hypack)
T1_oneHz<- make_1hz(alt_path = "Data/raw/CBASS/sensors/T1_D4/altimeter_readings.tsv", compass_path = "Data/raw/CBASS/sensors/T1_D4/compass_readings.tsv", ctd_path = "Data/raw/CBASS/sensors/T1_D4/ctd_redo_sal.tsv", Lat_Long = Lat_Long, Ship_Speed = Ship_Speed, cable_out = cable_out)
T3_oneHz<- make_1hz(alt_path = "Data/raw/CBASS/sensors/T3_D1/altimeter_readings.tsv", compass_path = "Data/raw/CBASS/sensors/T3_D1/compass_readings.tsv", ctd_path = "Data/raw/CBASS/sensors/T3_D1/ctd_redo_sal.tsv", Lat_Long = Lat_Long, Ship_Speed = Ship_Speed, cable_out = cable_out)
T5_oneHz<- make_1hz(alt_path = "Data/raw/CBASS/sensors/T5_D14/altimeter_readings.tsv", compass_path = "Data/raw/CBASS/sensors/T5_D14/compass_readings.tsv", ctd_path = "Data/raw/CBASS/sensors/T5_D14/ctd_redo_sal.tsv", Lat_Long = Lat_Long, Ship_Speed = Ship_Speed, cable_out = cable_out)
T6_oneHz<- make_1hz(alt_path = "Data/raw/CBASS/sensors/T6_D13/altimeter_readings.tsv", compass_path = "Data/raw/CBASS/sensors/T6_D13/compass_readings.tsv", ctd_path = "Data/raw/CBASS/sensors/T6_D13/ctd_redo_sal.tsv", Lat_Long = Lat_Long, Ship_Speed = Ship_Speed, cable_out = cable_out)
one_hz<- bind_rows(mutate(T1_oneHz, Transect="T1"),
mutate(T3_oneHz, Transect="T3"),
mutate(T5_oneHz, Transect="T5"),
mutate(T6_oneHz, Transect="T6"))
one_hz
Estimate C-BASS Position
CBASS Position is estimated using Pythagorean theorem using the cable out (the hypotenuse) and CBASS depth (the height of the triangle). The cable out is zeroed when the the CBASS hits the water so to get true cable out from the trawl block, and the true height of this triangle, the height of the trawl block above the water (~6.1 m) is added to the recorded cable out and CBASS Depth. The layback behind the block is then calculated using pythagorean theorum. The y offset (fore/aft distance; ~24.4 m) between the trawl block and the GPS antenna is then added to this layback to get the layback of the CBASS system behind the logged GPS ship position. CBASS is assumed to follow the ship track but with a time delay. The layback distance is converted to a time delay by dividing the layback distance by the average ship speed over the past minute. To get the CBASS position for a given time, the the ship position from “x” number of seconds ago (the time delay) is grabbed from the table and assigned as the current CBASS position. For more information see the documentation and source code for mytools::calc_layback (arguments used GPS_Source=“Nothstar”, zeroed=“water”, cat_fact=1).
one_hz<- one_hz %>% mutate(Layback_m= calc_layback(payout = payout, depth = depth, GPS_Source = "Northstar", zeroed = "water", cat_fact = 1))
one_hz<- one_hz %>% mutate(Layback_sec= round(Layback_m/Ship_Speed_mps_1minAvg))
one_hz<- one_hz %>% mutate(TimeToMatch=timestamp-dseconds(Layback_sec))
temp_one_hz<- one_hz %>% select(timestamp, Transect, ShipLon, ShipLat)
names(temp_one_hz)<- c("TimeToMatch", "Transect", "CBASSLon", "CBASSLat")
one_hz<- one_hz %>% left_join(temp_one_hz, by=c("TimeToMatch","Transect")) %>% select(-TimeToMatch)
rm(temp_one_hz)
one_hz
write_csv(one_hz, "Data/derived/CBASS/onehz.csv")
Georeference Video Habitat Observations
Calculate Frame number. Videos are 1 minute long at 12 frames per second.
hab<- hab %>% mutate(Frame_num= (Vid * 12 * 60) + Sec*12)
Get timestamp for each frame to nearest second
hab<- hab %>% left_join(select(cam, timestamp, u_second, Frame_num, Transect), by= c("Transect", "Frame_num"))
hab<- hab %>% mutate(timestamp=round_date(timestamp + dmicroseconds(u_second), unit = "second")) %>% select(-u_second)
Link to position by timestamp
hab<- hab %>% left_join(select(one_hz, timestamp, Transect, CBASSLat, CBASSLon), by=c("timestamp", "Transect"))
hab
write_csv(hab, path = "Data/derived/CBASS/hab.csv")
Prepare training and validation set
Only include habitats that were the same as their previous and subsequent observations to remove transitional/mixed areas to create set used for training and validation.
TV_set<- hab %>% group_by(Transect) %>% const_hab("Substrate") %>% ungroup()
print(unique(TV_set$Substrate))
[1] "Sand" "Rock" "No_Vis"
TV_set<- TV_set %>% filter(Substrate!="No_Vis") #Remove bad visibility
TV_set$Substrate<- factor(TV_set$Substrate, levels = c("Rock", "Sand"))
TV_set<- TV_set %>% mutate(Set=ifelse(test = Transect=="T1", yes = "Validation", no = "Training")) #Reserve T1 for validation of results. Use other transects for training data
Create Spatial Object
TV_set<- TV_set %>% filter(!is.na(CBASSLat)) #Remove where there is no positioning data
TV_set<- spTransform(SpatialPointsDataFrame(coords = cbind(TV_set$CBASSLon, TV_set$CBASSLat),
data = as.data.frame(select(TV_set, -c(CBASSLon, CBASSLat))),
proj4string = make_crs("WGS84")), CRSobj = make_crs("UTM 16N"))
TV_set<- TV_set[EL_shp,] #Remove points outside beyond bounds of the study area
The ground-truth dataset consisted of 3680 observations, where each observation was the substrate classification determined from the video at 15 second intervals. This consisted of 473 observations of rock, 3195 observations of sand, and 12 observations where substrate was not discernible. After censoring substrate determinations that differed from their previous and subsequent observations, those where substrate was not visible, and observations beyond the bounds of the multibeam survey or did not have associated positioning data, there were 238 observations of rock and 2533 observations of sand. These data were then split into training and validation sets. The training data set consisted of 210observations of rock and 1947 observations of sand. The validation transect consisted of 28 observations of rock and 586 observations of sand.
Figure S1.4

hab_onehz_joined<- hab %>% left_join(one_hz)
Transects total to 109 km in length
Overlaying vertical relief on substrate
Because higher relief features relatively rare, and are fairly small relative to the error associated with positioning of the CBASS, we did not directly try to predict this in the supervised and unsupervised models. However, we can measure vertical relief as a difference between the minimum and maximum values. Total depth was calculated from the CBASS sensors as CBASS depth + CBASS altitude (where altitude measurements have been adjusted for the pitch of the system). Vertical relief for each 15 second bin was calculated as the maximum change in depth over that 15 second observation window. A box plot of the assigned visual relief class, and the measured vertical relief from the sensors is shown below. In blue, orange, and red are the relief cutoffs for low, moderate, and high relief respectively as proposed by Smith et al (2011) in their reef fish survey of the Florida Keys. This appears to match up fairly well with our observations, so we will use these cutoffs: low relief <= 1m ; 1m < moderate relief <= 2m; high relief > 2m.
Figure S1.13
one_hz<- one_hz %>% mutate(Total_Depth= depth+ altitude*cos(radians(pitch))) #Calculate Total depth
one_hz<- one_hz %>% mutate(vertical_relief_15s = NA_real_)
for (i in 8:(nrow(one_hz)-7)) {
curr_idx<- (i-7):(i+7)
if(length(unique(one_hz$Transect[curr_idx]))!=1){
next() #Prevent averaging across different transects
}
dvalues<- one_hz$Total_Depth[curr_idx]
dmin<- min(dvalues, na.rm=TRUE)
dmax= max(dvalues, na.rm=TRUE)
vrelief<- dmax-dmin
if(is.infinite(vrelief)){vrelief<-NA}
one_hz$vertical_relief_15s[i]<- vrelief
}
hab_relief<- hab %>% left_join(one_hz, by= c("timestamp","Transect")) %>% select(timestamp, Transect, Relief, vertical_relief_15s) %>% filter(Relief!="None" & Relief != "No_Vis")
hab_relief$Relief<- factor(hab_relief$Relief, levels=c("Low_Relief", "Moderate_Relief", "High_Relief"))
visual_relief_plot<- ggplot(data = hab_relief, mapping = aes(x= Relief, y=vertical_relief_15s))+
annotate("rect", xmin = -Inf, xmax = Inf, ymin = 0, ymax = 1, fill= "dodgerblue", alpha=.2)+
annotate("rect", xmin = -Inf, xmax = Inf, ymin = 1, ymax = 2, fill= "orange", alpha=.2)+
annotate("rect", xmin = -Inf, xmax = Inf, ymin = 2, ymax = Inf, fill= "red", alpha=.2)+
geom_boxplot()+
scale_y_continuous(breaks = seq(0, 8, by= 1), expand = c(0,0))+
ylab("Vertical Relief (m) over 15 s")+
xlab("Relief Class")
visual_relief_plot

These thresholds are then used to delineate higher relief areas directly from the bathymetry by calculating the difference between a central pixel and the minimum depth of surrounding pixels using a sliding 3x3 window.
Figure S1.14
w<- matrix(data = 1, nrow = 3,ncol = 3) #3x3 window
w[5]<- NA
min_bathy<- focal(x = bathy, w=w, fun=min, na.rm=TRUE) #Calculate minimum depth around a central pixel
min_bathy[is.infinite(min_bathy)]<- NA #Replace Inf with NA
relief<- bathy-min_bathy #Calculate vertical relief
relief_qrules<- matrix(c(-Inf,1,2, 1, 2, Inf, 1,2,3), nrow = 3,ncol = 3)
relief_classified<- raster::reclassify(relief, relief_qrules, include.lowest=FALSE, right=TRUE)
relief_classified<- as.factor(relief_classified)
levels(relief_classified)[[1]]$Relief_Class<- c("Low","Mod", "High")
Relief_Map_Plot<- tm_shape(relief, raster.downsample = FALSE)+
tm_raster(palette=pal1, midpoint = NA, style="cont", title = "Relief (m)")+
tm_graticules(lines=FALSE)
Relief_Classified_Plot<- tm_shape(st_as_stars(relief_classified, ignore_file=TRUE), raster.downsample = FALSE)+
tm_raster(palette=c("gray", "yellow", "red"), title = "Relief Class")+
tm_graticules(lines=FALSE)
tmap_arrange(Relief_Map_Plot, Relief_Classified_Plot)

Figure S1.15
sub_relief<- (sup_hab*10)+relief_classified
sub_relief[sub_relief==22]<-21
sub_relief[sub_relief==23]<-21 #Collapse sand to one category
sub_relief<- as.factor(sub_relief)
names(sub_relief)<- "Habitat"
levels(sub_relief)[[1]]$habitat<- c("LR Rock", "MR Rock", "HR Rock", "Sand")
sub_relief_plot<- tm_shape(droplevels(st_as_stars(sub_relief, ignore_file=TRUE), exclude=NA), raster.downsample = FALSE)+
tm_raster(palette = c("dodgerblue", "orange", "red", "#FFEBBE"))+
tm_graticules(lines=FALSE)
sub_relief_plot

---
title: "Elbow Mapping Analyses"
author: "Alexander Ilich"
date: "`r format(Sys.time(), '%B %d, %Y')`"
output:
  html_notebook:
    fig_width: 6
    fig_height: 9
    df_print: paged
editor_options:
  chunk_output_type: inline
---

```{r setup, include=FALSE}
knitr::opts_knit$set(root.dir = normalizePath(".."))
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, cache = FALSE, dpi = 300, fig.width = 6, fig.height = 9) #Default chunk options
installed_packages<- installed.packages()
needed_packages<- c("caret", "RColorBrewer", "parallel", "snow", "ranger", "Boruta", "lubridate", "rgdal", "sp", "raster", "stars", "tmap", "RStoolbox",
                    "tidyverse", "readxl", "spdplyr", "colorRamps", "devtools")

packages_to_install<- needed_packages[!(needed_packages %in% installed_packages)]
if(length(packages_to_install)>0){
  install.packages(packages_to_install)
  }
if(!("mytools" %in% installed_packages)){
  devtools::install_github("ailich/mytools@v0.31") 
  } 
if(!("GLCMTextures" %in% installed_packages)){
  devtools::install_github("ailich/GLCMTextures@0.1") 
  } #Install

library(caret) #Modelling
library(RColorBrewer) #Color Palettes
library(parallel) #Parallel Processing
library(snow) #Parallel Processing
library(ranger) #For Random Forest Model
library(Boruta) #Variable Selection
library(lubridate) #Time Data
library(rgdal) #Spatial Data
library(sp) #Spatial data
library(raster) #Spatial Data
library(stars) #Spatial data
library(tmap) #Spatial Plotting
library(RStoolbox) #Spatial Modelling
library(tidyverse) #Data Wrangling
library(readxl) #read excel sheets
library(spdplyr) #Data Wrangling for spatial data
library(GLCMTextures) #GLCM Texture Metrics
library(mytools) #My functions
options(stringsAsFactors = FALSE)
Sys.setenv(TZ='UTC') #Set time to display in UTC
options(digits.secs= 6) #Time digits displayed and written to 6 places
rasterOptions(addheader = "ENVI") #Preserves names when writing raster stacks to file
```

```{r warning=TRUE, include=FALSE}
#mytools and GLCMTextures are packages I created. mytools is assorted functions I have made over time and GLCMTextures calculates texture metrics
if(packageVersion("mytools")!="0.3"){
  warning("mytools is a different version than the one I used to run this script and may produce different results")
}
if(packageVersion("GLCMTextures")!="0.0.1.0"){
  warning("GLCMTextures is a different version than the one I used to run this script and may produce different results")
}
```

```{r eval=FALSE, include=FALSE}
#Note to load the environment as it would be at the end of the script without running it, run this line of code
load("Data/R_environment/Elbow_R_Mapping_Environment.RData")
```

```{r Define Functions, include=FALSE}
#Function for parsing raw data into 1Hz table
make_1hz<- function(alt_path, compass_path, ctd_path, Lat_Long, Ship_Speed, cable_out){
  Lat_Long<- Lat_Long %>% mutate(timestamp= round_date(timestamp, unit = "second")) %>%
    group_by(timestamp) %>%
    summarize(ShipLon=mean(ShipLon, na.rm=TRUE), ShipLat=mean(ShipLat, na.rm=TRUE), .groups="drop")

  cable_out<- cable_out %>% mutate(timestamp= round_date(timestamp + dmicroseconds(u_second), unit = "second")) %>%
    group_by(timestamp) %>%
    summarize(payout=mean(payout, na.rm=TRUE), .groups="drop")

  Ship_Speed<- Ship_Speed %>% mutate(Ship_Speed_mps=Speed_kph*(1000/(60*60))) #Convert to meters per second
  Ship_Speed<- Ship_Speed %>% mutate(timestamp= round_date(timestamp, unit = "second")) %>%
    group_by(timestamp) %>%
    summarize(Ship_Speed_mps=mean(Ship_Speed_mps, na.rm=TRUE), .groups="drop")

  alt<- read_tsv(alt_path, col_types = list(.default=col_double(), timestamp=col_datetime()))
  compass<- read_tsv(compass_path, col_types = list(.default=col_double(), timestamp=col_datetime()))
  ctd<- read_tsv(ctd_path, col_types = list(.default=col_double(), timestamp=col_datetime()))

  alt$altitude[alt$altitude>=50]<- NA #50 is max value of altimeter 60 any value 50 or greater is recorded as 50
  alt<- alt %>% mutate(timestamp=round_date(timestamp+dmicroseconds(u_second), unit = "second")) %>%
    group_by(timestamp) %>%
    summarize(altitude=median(altitude, na.rm=TRUE), .groups="drop")

  compass<- compass %>% mutate(timestamp=round_date(timestamp+dmicroseconds(u_second), unit = "second")) %>%
    group_by(timestamp) %>%
    summarize(heading= median(heading, na.rm = TRUE), pitch=median(pitch, na.rm=TRUE), roll=median(roll, na.rm=TRUE), .groups="drop")

  ctd<- ctd %>% mutate(timestamp=round_date(timestamp+dmicroseconds(u_second), unit = "second")) %>%
    group_by(timestamp) %>%
    summarize(temperature= mean(temperature, na.rm=TRUE), depth=mean(depth, na.rm=TRUE), salinity= mean(salinity, na.rm=TRUE), .groups="drop")

  one_hz<- tibble(timestamp=seq.POSIXt(from =min(c(alt$timestamp, compass$timestamp, ctd$timestamp))-dseconds(180), to = max(c(alt$timestamp, compass$timestamp, ctd$timestamp)), by = "sec"))

  one_hz<- one_hz %>%
    left_join(Lat_Long, by="timestamp") %>%
    left_join(Ship_Speed, by="timestamp") %>%
    left_join(alt, by="timestamp") %>%
    left_join(compass, by="timestamp") %>%
    left_join(ctd, by="timestamp") %>%
    left_join(cable_out, by="timestamp")

  one_hz$ShipLon<- interp(one_hz$ShipLon, na.rm = TRUE)
  one_hz$ShipLat<- interp(one_hz$ShipLat, na.rm = TRUE)
  one_hz$altitude<- interp(one_hz$altitude, na.rm = TRUE)
  one_hz$altitude[180]<- NA_real_
  one_hz$pitch<- interp(one_hz$pitch, na.rm = TRUE)
  one_hz$pitch[180]<- NA_real_
  one_hz$depth<- interp(one_hz$depth, na.rm = TRUE)
  one_hz$depth[180]<- NA_real_
  one_hz$payout<- interp(one_hz$payout, na.rm = TRUE)

  one_hz<- one_hz %>% mutate(Ship_Speed_mps_1minAvg=NA_real_)
  for (i in 60:nrow(one_hz)) {
    one_hz$Ship_Speed_mps_1minAvg[i]<- mean(one_hz$Ship_Speed_mps[(i-59):i], na.rm=TRUE)
  } #1 minute rolling average
  return(one_hz)
}

fib_seq<- function(n){
  fib<- rep(NA_real_, n)
  fib[1:2]<- c(0,1)
  for (i in 3:length(fib)) {
  fib[i]<- fib[i-1]+fib[i-2]
  }
  return(fib)}

extract_ranger_oob<- function(obj, trainingData, response_col){
  if(class(obj)[1]=="superClass"){
    obj<- obj$model}
  
  if(class(obj)[1]=="train"){
    trainingData<- obj$trainingData
    response_col<- ".outcome"
    final_mod<- obj$finalModel
  } else{
    final_mod<- obj}
  n_trees<- final_mod$num.trees
  n_classes<- length(unique(trainingData[[response_col]]))
  
  Tree_OOB<- data.frame(Tree= rep(1:n_trees, each= n_classes+1), OOB = NA_real_)
  j<-1
  for (i in 1:n_trees) {
    idx<- final_mod$inbag.counts[[i]]==0
    OOB_data<- trainingData[idx,]
    OOB<- stats::predict(final_mod, OOB_data)
    OOB_prediction<- as.data.frame(OOB$predictions)
    
    if(ncol(OOB_prediction)>1){
      OOB_prediction<- OOB_prediction %>% mutate(ID=1:nrow(all_of(OOB_prediction)))
      OOB_prediction<- OOB_prediction %>% pivot_longer(-ID)
      OOB_prediction<- OOB_prediction %>% group_by(ID) %>% filter(value==max(value)) %>% sample_n(1) %>% ungroup() %>% arrange(ID)
      OOB_prediction<- OOB_prediction %>% select(name)
      names(OOB_prediction)<- "V1"
      OOB_prediction<- OOB_prediction$V1
      } #Reformat if was in probaility instead of classification mode

    Observed_vs_Predicted<- data.frame(Observed= OOB_data[[response_col]],
                                       Predicted= OOB_prediction) 
    Tree_OOB$OOB[j]<- sum(Observed_vs_Predicted$Observed!=Observed_vs_Predicted$Predicted)/nrow(Observed_vs_Predicted)
    Tree_OOB$Type[j]<- "Overall"
    by_classOOB<- Observed_vs_Predicted %>% group_by(Observed, .drop=FALSE) %>% summarize(OOB= sum(Observed!=Predicted)/n(), .groups="drop")
    Tree_OOB$OOB[(j+1):(j+n_classes)]<- by_classOOB$OOB
    Tree_OOB$Type[(j+1):(j+n_classes)]<- as.character(by_classOOB$Observed)
    j<- j+n_classes+1
  }
  Tree_OOB<- Tree_OOB %>% pivot_wider(names_from = Type, values_from=OOB)
  Mean_OOB<- data.frame(ntrees= 1:n_trees)
  for (k in 2:ncol(Tree_OOB)) {
    NA_idx<- is.na(Tree_OOB[,k])
    Mean_OOB_temp<- Tree_OOB[!NA_idx,c(1,k)]
    Mean_OOB_temp[,2]<- cummean(as.data.frame(Mean_OOB_temp)[,2])
    Mean_OOB<- Mean_OOB %>% left_join(Mean_OOB_temp, by=c("ntrees"="Tree"))
  }
  return(Mean_OOB)
}
```

# Study Area
```{r include=FALSE}
FL_shp<- readOGR("Data/derived/shapefiles/Florida_Shoreline__1_to_2%2C000%2C000_Scale_.shp") #Source: https://geodata.myfwc.com/datasets/florida-shoreline-1-to-2000000-scale?geometry=-102.366%2C24.356%2C-65.298%2C31.156
EL_shp<- readOGR("Data/derived/shapefiles/EL_Shp.shp")
```
The study area is marked in red, and is know as "The Elbow." It is about 145 km west-northwest of Tampa Bay, Florida. The area is a popular offshore fishing area that contains contains hard bottom ridges and is hypothesized to be a paleoshoreline shaped by wave action approximately 12,000 years ago.

Figure S1.1

```{r echo=FALSE, fig.width=6, fig.height=6}
tm_shape(FL_shp)+
  tm_fill()+
  tm_shape(EL_shp)+
  tm_fill(col="red")+
  tm_graticules(lines=FALSE)+
  tm_credits("https://geodata.myfwc.com/datasets", position = c("left", "bottom"))
```

# Match Bathymetry and Raster Grids

## Read in Data

Bathymetry
```{r }
bathy_og<- raster("Data/raw/Multibeam/ElbowDec2015Cube_2m.tiff") #Original bathymetry
```

```{r echo=FALSE}
bathy_og
```
Backscatter
```{r }
backscatter_og<- raster("Data/raw/Multibeam/EL1_1mTimeSeriesBS_TrimmedtoMosaic_AVG800.tiff")
```

```{r echo=FALSE}
backscatter_og
```
```{r eval=FALSE, message=FALSE, results='hide'}
EL_shp<- readOGR("Data/derived/shapefiles/EL_Shp.shp")
```
```{r echo=FALSE}
print(summary(EL_shp)$class)
print(paste("Extent:", paste(as.character(round(as.vector(t(summary(EL_shp)$bbox)))), collapse=" "), "(xmin, xmax, ymin, ymax)"))
print(paste("crs:", summary(EL_shp)$proj4string))
```

## Resample to macthing 10m grid
```{r }
bathy<- projectRaster(from = bathy_og, res = 10, crs = make_crs("UTM 16N")) #Reproject/resample
bathy<- focal(bathy, w=matrix(data = 1, nrow = 3, ncol = 3), fun=mean, na.rm=TRUE, NAonly=TRUE) #Fill holes   
names(bathy)<- "bathy"
backscatter<- projectRaster(from= backscatter_og, to =bathy) #Reproject/resample
backscatter<- focal(backscatter, w=matrix(data = 1, nrow = 3, ncol = 3), fun=mean, na.rm=TRUE, NAonly=TRUE) #Fill holes   
names(backscatter)<- "backscatter"
```

Trim data to mask
```{r }
bathy<- mask(x = bathy, EL_shp)
backscatter<- mask(x = backscatter, EL_shp)
```

## Finalized Surfaces

Figure S1.2

```{r echo=FALSE, fig.width=10, include=TRUE}
pal1<- colorRamps::matlab.like(100)

bathy_plot<- tm_shape(bathy, raster.downsample = FALSE) +
  tm_raster(palette = pal1, style= "cont", title = "Bathymetry (m)")+
  tm_compass(type = "8star", position = c("right", "top"))+
  tm_scale_bar(breaks= c(0, 2.5, 5), text.size=1)+
  tm_credits(text = "a)", position = c("left", "bottom"), size = 1)+
  tm_graticules(lines=FALSE)

backscatter_plot<- tm_shape(backscatter, raster.downsample = FALSE)+
  tm_raster(palette = grey.colors(100), style= "cont", title = "Backscatter Intensity (dB)")+
  tm_compass(type = "8star", position = c("right", "top"))+
  tm_scale_bar(breaks= c(0, 2.5, 5), text.size=1)+
  tm_credits(text = "b)", position = c("left", "bottom"), size = 1)+
  tm_graticules(lines=FALSE)

tmap_arrange(bathy_plot, backscatter_plot)
```

```{r include=FALSE}
writeRaster(x = bathy, filename = "Data/derived/Multibeam/bathy.tif", overwrite=TRUE)
writeRaster(x = bathy, filename = "Data/derived/Multibeam/bathy.grd", overwrite=TRUE)
bathy<- raster("Data/derived/Multibeam/bathy.grd")
writeRaster(x = backscatter, filename = "Data/derived/Multibeam/backscatter.tif", overwrite=TRUE)
writeRaster(x = backscatter, filename = "Data/derived/Multibeam/backscatter.grd", overwrite=TRUE)
backscatter<- raster("Data/derived/Multibeam/backscatter.grd")
```

# Calculate Derivative Features

## Determine scales of analysis
The Fibonacci sequence is a good way to systematically cover multiple-scales (Wilson, 2007).
```{r}
my_window_sizes<- 1+ (2*fib_seq(10)[-c(1,2)])
print(my_window_sizes)
```
We will evaluate predictors at `r length(my_window_sizes)` different window sizes from `r min(my_window_sizes)`x`r min(my_window_sizes)` to `r max(my_window_sizes)`x`r max(my_window_sizes)`

## Bathymetric Derivative Features
Calculate bathymetric derivatives at various scales of analysis. This includes the slope, aspect (split into Northness and Eastness), Topographic Position Index (Whether an area is a local high or low), mean bathymetry, and standard deviation of bathymetry (a measure of rugosity).

```{r results='hide'}
slope<- terrain(bathy, opt= "slope", unit = "degrees")
names(slope)<- "slope_native"
aspect<- terrain(bathy, opt = c("aspect"), unit = "radians")
eastness<- sin(aspect)
names(eastness)<- "eastness_native"
northness<- cos(aspect)
names(northness)<- "northness_native"

for (i in my_window_sizes) {
  print(i)
  w<- matrix(data = 1, nrow = i, ncol = i)
  
  curr_slope<- focal(x = slope$slope_native, w, fun=mean, na.rm=TRUE, pad=TRUE)
  names(curr_slope)<- paste0("slope_",as.character(i),"x", as.character(i))
  slope<- addLayer(slope, curr_slope)
  
  curr_eastness<- focal(x = eastness$eastness_native, w, fun=mean, na.rm=TRUE, pad=TRUE)
  names(curr_eastness)<- paste0("eastness_",as.character(i),"x", as.character(i))
  eastness<- addLayer(eastness, curr_eastness)
  
  curr_northness<- focal(x = northness$northness_native, w, fun=mean, na.rm=TRUE, pad=TRUE)
  names(curr_northness)<- paste0("northness_",as.character(i),"x", as.character(i))
  northness<- addLayer(northness, curr_northness)
  
  curr_tpi<- TPI(bathy, d=c(i,i), na.rm=TRUE, pad = TRUE)
  names(curr_tpi)<-  paste0("tpi_",as.character(i),"x", as.character(i))
  
  curr_BathyMean<- focal(x = bathy, w, fun=mean, na.rm=TRUE, pad=TRUE)
  names(curr_BathyMean)<-  paste0("BathyMean_",as.character(i),"x", as.character(i))
  
  curr_BathySD<- focal(x = bathy, w, fun=sd, na.rm=TRUE, pad=TRUE)
  names(curr_BathySD)<-  paste0("BathySD_",as.character(i),"x", as.character(i))

  if(i==3){
    tpi<- curr_tpi
    BathyMean<- curr_BathyMean
    BathySD<- curr_BathySD 
    } else{
      tpi<- addLayer(tpi, curr_tpi)
      BathyMean<- addLayer(BathyMean, curr_BathyMean)
      BathySD<- addLayer(BathySD, curr_BathySD)
    }
}

bathy_deriv<- stack(BathyMean, BathySD, tpi, slope, eastness, northness)
bathy_deriv<- mask(x = bathy_deriv, EL_shp) #Trim back down b/c used pad and na.rm
```

```{r include=FALSE}
writeRaster(bathy_deriv, "Data/derived/Multibeam/bathy_deriv.grd", overwrite=TRUE)
rm(slope, aspect, eastness, northness, curr_slope, curr_eastness, curr_northness, curr_tpi, curr_BathyMean, curr_BathySD, tpi, BathyMean, BathySD)
bathy_deriv<- brick("Data/derived/Multibeam/bathy_deriv.grd")
```

## Backscatter Derivative Features
Calculate backscatter derivatives at various scales of analysis. This includes the mean and standard deviation of bathymetry as well as texture metric derived from a Gray Level Co-occurence Matrix (GLCM). 

The most common texture metrics from remote sensing were derived using the formulas from (Hall-Beyer 2017). Rotationally Invariant/Directionally Isotropic Harilick Texture Metrics were derived by calculating a symetrical GLCM with 32 gray levels.
The texture metrics can be broken down into three groups: The contrast group (contrast, dissimilarity, and homogeneity), the orderliness group (ASM and Entropy), and the descriptive statistics group (mean, variance, and correlation;Hall-Beyer, 2017). At smaller window sizes, the correlation texture can sometimes be undefined (0/0) leading to holes in the resulting raster surfaces, so correlation at 3x3, 5x5, and 7x7 were removed from the analysis to prevent this.

```{r cachce=TRUE, results='hide'}
backscatter_quant<- quantize_raster(backscatter, n_levels = 32,method = "equal prob")
for (i in my_window_sizes) {
  print(i)
  w<- matrix(data = 1, nrow = i, ncol = i)
  
  curr_bs_mean<- focal(backscatter, w, fun=mean, na.rm=TRUE, pad=TRUE)
  names(curr_bs_mean)<- "BackscatterMean"
  curr_bs_sd<- focal(backscatter, w, fun=sd, na.rm=TRUE, pad=TRUE) 
  names(curr_bs_sd)<- "BackscatterSD"
  
  curr_BSderiv<- stack(curr_bs_mean, 
                       curr_bs_sd,
                       glcm_textures(backscatter_quant, w= c(i,i), n_levels=32,  shift = list(c(1, 0), c(1, 1), c(0, 1), c(-1, 1)), metrics = c("glcm_contrast", "glcm_dissimilarity", "glcm_homogeneity", "glcm_ASM",
    "glcm_entropy", "glcm_mean", "glcm_variance", "glcm_correlation"), quantization= "none", na_opt = "center", pad = TRUE))
  
  names(curr_BSderiv)<- paste0(names(curr_BSderiv), "_", as.character(i), "x", as.character(i))
  
  if(i==3){
    backscatter_deriv<- curr_BSderiv} else{
      backscatter_deriv<- stack(backscatter_deriv, curr_BSderiv)
      rm(curr_BSderiv)
    }
}
backscatter_deriv<- mask(backscatter_deriv, EL_shp)
backscatter_deriv<- dropLayer(backscatter_deriv, which(names(backscatter_deriv)=="glcm_correlation_3x3"))
backscatter_deriv<- dropLayer(backscatter_deriv, which(names(backscatter_deriv)=="glcm_correlation_5x5"))
backscatter_deriv<- dropLayer(backscatter_deriv, which(names(backscatter_deriv)=="glcm_correlation_7x7")) #remove these b/c have NA's that create holes due to zero in denominator
```

```{r include=FALSE}
writeRaster(backscatter_deriv, "Data/derived/Multibeam/backscatter_deriv.grd", overwrite=TRUE)
rm(backscatter_quant, curr_bs_mean, curr_bs_sd)
backscatter_deriv<- brick("Data/derived/Multibeam/backscatter_deriv.grd")
```

# Georeference Ground-Truth Data

## Transects

```{r echo=FALSE, results='hide'}
T1<- spTransform(readOGR("Data/derived/shapefiles/ELT1_Shiptrack.shp"), CRSobj = make_crs("UTM 16N"))
T3<- spTransform(readOGR("Data/derived/shapefiles/ELT3_Shiptrack.shp"), CRSobj = make_crs("UTM 16N"))
T5<- spTransform(readOGR("Data/derived/shapefiles/ELT5_Shiptrack.shp"), CRSobj = make_crs("UTM 16N"))
T6<- spTransform(readOGR("Data/derived/shapefiles/ELT6_Shiptrack.shp"), CRSobj = make_crs("UTM 16N"))
```

Solid lines represent transects for training data, and dashed line represents the transect where observations were withheld in order to test the accuracy of substrate predictions

Figure S1.3

```{r echo=FALSE}
transect_plot<- tm_shape(bathy, raster.downsample = FALSE)+
  tm_raster(palette = pal1, style= "cont", title = "", legend.show = FALSE)+
  tm_shape(rbind(T3,T5,T6))+
  tm_lines()+
  tm_shape(T1)+
  tm_lines(lty="dashed")+
  tm_graticules(lines=FALSE)
transect_plot
```

## Read in Video Habitat Observations

```{r include=FALSE}
T1_hab<- read_xlsx("Data/raw/CBASS/habitat/T1_D4_quickhab2.xlsx")
T1_hab<- T1_hab[which(T1_hab$Comments=="Bottom"):nrow(T1_hab),]

T3_hab<- read_xlsx("Data/raw/CBASS/habitat/T3_D1_quickhab2.xlsx")
T3_hab<- T3_hab[which(T3_hab$Comments=="Bottom"):nrow(T3_hab),]

T5_hab<- read_xlsx("Data/raw/CBASS/habitat/T5_D14_quickhab2.xlsx")
T5_hab<- T5_hab[which(T5_hab$Comments=="Bottom"):nrow(T5_hab),]

T6_hab<- read_xlsx("Data/raw/CBASS/habitat/T6_D13_quickhab2.xlsx")
T6_hab<- T6_hab[which(T6_hab$Comments=="Bottom"):nrow(T6_hab),]

hab<- bind_rows(T1_hab, T3_hab, T5_hab, T6_hab)
rm(T1_hab,T3_hab,T5_hab, T6_hab)
hab<- hab %>% mutate(Substrate= ifelse(test = is.na(Geo), yes = "Sand", no = "Rock"))
hab$Substrate[hab$Geo=="No_Vis"]<- "No_Vis"
hab<- hab %>% mutate(Relief= ifelse(test = is.na(Geo), yes = "None", no = Geo))
hab<- hab %>% select(Transect, Vid, Sec, Substrate, Relief)
```

```{r echo=FALSE}
hab
```

## Read in Timestamps for Camera Images

```{r include=FALSE}
file_list<- list.files("Data/raw/CBASS/camera", full.names = TRUE)
for (i in 1:length(file_list)) {
  curr_cam<- read_tsv(file_list[i])
  rec<- curr_cam %>% 
    mutate(recording=str_extract(string = file_path, pattern = "\\d{8}-\\d{6}")) %>%
    group_by(recording) %>%
    summarise(COUNT=n()) %>%
    ungroup()
    chosen_recording<- rec$recording[which.max(rec$COUNT)]
    curr_cam<- curr_cam %>% 
      filter(str_detect(string = file_path, pattern = chosen_recording)) %>%
      filter(str_detect(string = file_path, pattern = "\\.jpg$"))
    curr_cam<- curr_cam %>% mutate(Frame_ID= as.numeric(str_extract(str_extract(string = file_path, pattern = "GigEGrabEx-\\d{7}\\.jpg$"), pattern = "\\d{7}")))
    curr_cam<- curr_cam %>% mutate(Transect=str_extract(string = file_list[i], pattern = "T\\d"))
    print(check_jumbled(curr_cam)==FALSE)
    if(i==1){cam<- curr_cam} else{
      cam<- cam %>% bind_rows(curr_cam)}
} #Clean data by removing short recordings not included in mosaiced video and filtering for only hen images were written
cam<- cam %>% group_by(Transect) %>% mutate(Frame_num= Frame_ID-min(Frame_ID)) %>% ungroup() #Sometimes frame filenames were not reset back to zero, so subtract minimum frame of a transect so that you can associate video frame # with an image frame #
cam<- cam %>% select(timestamp, u_second,file_path, Transect, Frame_ID, Frame_num)
write_csv(cam, path = "Data/derived/CBASS/camera.csv")
```

```{r message=FALSE, echo=TRUE}
cam<- read_csv("Data/derived/CBASS/camera.csv")
```
```{r echo=FALSE}
cam
```

## Read in Position and Ancillary Data Streams

```{r include=TRUE, message=FALSE}
Hypack<- read_hypack(list.files("Data/raw/Hypack/", pattern="\\.RAW$", full.names = TRUE))
cable_out<- read_tsv("Data/raw/payout/payout_Feb&Oct2016.tsv", col_types = list(.default=col_double(), timestamp=col_datetime()))
Lat_Long<- get_HypackPos(Hypack)
names(Lat_Long)[2:3]<- c("ShipLon", "ShipLat")
Ship_Speed<- get_HypackSpeed(Hypack)

T1_oneHz<- make_1hz(alt_path = "Data/raw/CBASS/sensors/T1_D4/altimeter_readings.tsv", compass_path = "Data/raw/CBASS/sensors/T1_D4/compass_readings.tsv", ctd_path = "Data/raw/CBASS/sensors/T1_D4/ctd_redo_sal.tsv", Lat_Long = Lat_Long, Ship_Speed = Ship_Speed, cable_out = cable_out)

T3_oneHz<- make_1hz(alt_path = "Data/raw/CBASS/sensors/T3_D1/altimeter_readings.tsv", compass_path = "Data/raw/CBASS/sensors/T3_D1/compass_readings.tsv", ctd_path = "Data/raw/CBASS/sensors/T3_D1/ctd_redo_sal.tsv", Lat_Long = Lat_Long, Ship_Speed = Ship_Speed, cable_out = cable_out)

T5_oneHz<- make_1hz(alt_path = "Data/raw/CBASS/sensors/T5_D14/altimeter_readings.tsv", compass_path = "Data/raw/CBASS/sensors/T5_D14/compass_readings.tsv", ctd_path = "Data/raw/CBASS/sensors/T5_D14/ctd_redo_sal.tsv", Lat_Long = Lat_Long, Ship_Speed = Ship_Speed, cable_out = cable_out)

T6_oneHz<- make_1hz(alt_path = "Data/raw/CBASS/sensors/T6_D13/altimeter_readings.tsv", compass_path = "Data/raw/CBASS/sensors/T6_D13/compass_readings.tsv", ctd_path = "Data/raw/CBASS/sensors/T6_D13/ctd_redo_sal.tsv", Lat_Long = Lat_Long, Ship_Speed = Ship_Speed, cable_out = cable_out)

one_hz<- bind_rows(mutate(T1_oneHz, Transect="T1"),
                mutate(T3_oneHz, Transect="T3"),
                mutate(T5_oneHz, Transect="T5"),
                mutate(T6_oneHz, Transect="T6"))
one_hz
```

## Estimate C-BASS Position

CBASS Position is estimated using Pythagorean theorem using the cable out (the hypotenuse) and CBASS depth (the height of the triangle). The cable out is zeroed when the the CBASS hits the water so to get true cable out from the trawl block, and the true height of this triangle, the height of the trawl block above the water (\~6.1 m) is added to the recorded cable out and CBASS Depth. The layback behind the block is then calculated using pythagorean theorum. The y offset (fore/aft distance; \~24.4 m) between the trawl block and the GPS antenna is then added to this layback to get the layback of the CBASS system behind the logged GPS ship position. CBASS is assumed to follow the ship track but with a time delay. The layback distance is converted to a time delay by dividing the layback distance by the average ship speed over the past minute. To get the CBASS position for a given time, the the ship position from "x" number of seconds ago (the time delay) is grabbed from the table and assigned as the current CBASS position. For more information see the documentation and source code for mytools::calc\_layback (arguments used GPS\_Source="Nothstar", zeroed="water", cat\_fact=1).

```{r warning=FALSE}
one_hz<- one_hz %>% mutate(Layback_m= calc_layback(payout = payout, depth = depth, GPS_Source = "Northstar", zeroed = "water", cat_fact = 1))
one_hz<- one_hz %>% mutate(Layback_sec= round(Layback_m/Ship_Speed_mps_1minAvg))
one_hz<- one_hz %>% mutate(TimeToMatch=timestamp-dseconds(Layback_sec))
temp_one_hz<- one_hz %>% select(timestamp, Transect, ShipLon, ShipLat)
names(temp_one_hz)<- c("TimeToMatch", "Transect", "CBASSLon", "CBASSLat")
one_hz<- one_hz %>% left_join(temp_one_hz, by=c("TimeToMatch","Transect")) %>% select(-TimeToMatch)
rm(temp_one_hz)
one_hz
write_csv(one_hz, "Data/derived/CBASS/onehz.csv")
```

## Georeference Video Habitat Observations

Calculate Frame number. Videos are 1 minute long at 12 frames per second.

```{r}
hab<- hab %>% mutate(Frame_num= (Vid * 12 * 60) + Sec*12)
```

Get timestamp for each frame to nearest second

```{r}
hab<- hab %>% left_join(select(cam, timestamp, u_second, Frame_num, Transect), by= c("Transect", "Frame_num"))
hab<- hab %>% mutate(timestamp=round_date(timestamp + dmicroseconds(u_second), unit = "second")) %>% select(-u_second)
```

Link to position by timestamp

```{r include=TRUE}
hab<- hab %>% left_join(select(one_hz, timestamp, Transect, CBASSLat, CBASSLon), by=c("timestamp", "Transect"))
```
```{r echo=TRUE}
hab
```
```{r}
write_csv(hab, path = "Data/derived/CBASS/hab.csv")
```

## Prepare training and validation set

Only include habitats that were the same as their previous and subsequent observations to remove transitional/mixed areas to create set used for training and validation.

```{r}
TV_set<- hab %>% group_by(Transect) %>% const_hab("Substrate") %>% ungroup()
print(unique(TV_set$Substrate))
TV_set<- TV_set %>% filter(Substrate!="No_Vis") #Remove bad visibility
TV_set$Substrate<- factor(TV_set$Substrate, levels = c("Rock", "Sand"))
TV_set<- TV_set %>% mutate(Set=ifelse(test = Transect=="T1", yes = "Validation", no = "Training")) #Reserve T1 for validation of results. Use other transects for training data
```

### Create Spatial Object

```{r}
TV_set<- TV_set %>% filter(!is.na(CBASSLat)) #Remove where there is no positioning data
TV_set<- spTransform(SpatialPointsDataFrame(coords = cbind(TV_set$CBASSLon, TV_set$CBASSLat),
                                data = as.data.frame(select(TV_set, -c(CBASSLon, CBASSLat))), 
                                proj4string = make_crs("WGS84")), CRSobj = make_crs("UTM 16N"))
TV_set<- TV_set[EL_shp,] #Remove points outside beyond bounds of the study area
```

The ground-truth dataset consisted of `r nrow(hab)` observations, where each observation was the substrate classification determined from the video at 15 second intervals. This consisted of `r sum(hab$Substrate=="Rock")` observations of rock, `r sum(hab$Substrate=="Sand")` observations of sand, and `r sum(hab$Substrate=="No_Vis")` observations where substrate was not discernible. After censoring substrate determinations that differed from their previous and subsequent observations, those where substrate was not visible, and observations beyond the bounds of the multibeam survey or did not have associated positioning data, there were `r TV_set %>% filter(Substrate=="Rock") %>% nrow()` observations of rock and `r TV_set %>% filter(Substrate=="Sand") %>% nrow()` observations of sand. These data were then split into training and validation sets. The training data set consisted of `r TV_set %>% filter(Set=="Training" & Substrate=="Rock") %>% nrow()`observations of rock and `r TV_set %>% filter(Set=="Training" & Substrate=="Sand") %>% nrow()` observations of sand. The validation transect consisted of `r TV_set %>% filter(Set=="Validation" & Substrate=="Rock") %>% nrow()` observations of rock and `r TV_set %>% filter(Set=="Validation" & Substrate=="Sand") %>% nrow()` observations of sand.

Figure S1.4

```{r echo=FALSE}
training_plot<- tm_shape(EL_shp) +
  tm_polygons(title = "Training", col = "cadetblue1", legend.show = FALSE)+
tm_shape(filter(TV_set, Set=="Training"))+
  tm_dots(col="Substrate", palette=c("red", "#FFEBBE"), legend.show = TRUE)+
  tm_layout(legend.bg.color = "gray")+
    tm_credits(text = "a)", position = c("right", "bottom"), size = 1)+
   tm_graticules(lines=FALSE)+
    tm_layout(main.title = "Training", main.title.position = .43) 

validation_plot<- tm_shape(EL_shp) +
  tm_polygons(title = "Validation", col = "cadetblue1", legend.show = FALSE)+
  tm_shape(filter(TV_set, Set=="Validation"))+
  tm_dots(col="Substrate", palette=c("red", "#FFEBBE"), legend.show = FALSE)+
  tm_credits(text = "b)", position = c("right", "bottom"), size = 1)+
   tm_graticules(lines=FALSE)+
    tm_layout(main.title = "Validation", main.title.position = .39) 


gtruth_plot<- tmap_arrange(training_plot, validation_plot)
gtruth_plot
```

```{r}
hab_onehz_joined<- hab %>% left_join(one_hz)
```
 
Transects total to `r hab_onehz_joined %>% group_by(Transect) %>% summarize(st_time=min(timestamp, na.rm=TRUE), end_time=max(timestamp, na.rm=TRUE), avg_speed_mps= mean(Ship_Speed_mps, na.rm=TRUE), .groups="drop") %>% mutate(transect_duration=as.numeric(difftime(end_time, st_time, units = "secs"))) %>% mutate(transect_lenth_km= (transect_duration*avg_speed_mps)/1000) %>% pull(transect_lenth_km) %>% sum() %>% round()` km in length


# Predictive Habitat Mapping

```{r}
predictor_vars<- stack(bathy, bathy_deriv, backscatter, backscatter_deriv)
n_predictors<-nlayers(predictor_vars)
```

```{r include=FALSE}
writeRaster(predictor_vars, "Data/derived/Multibeam/predictor_vars.grd",overwrite=TRUE)
predictor_vars<- brick("Data/derived/Multibeam/predictor_vars.grd")
```

## Supervised Classification (Random Forest)

### Variable Selection
There are `r n_predictors` predictor variables in total. To decide on which variables to retain in the final model, we first use the Boruta feature selection algorithm, which through an iterative procedure determines whether the importance of a predictor is significantly greater (p_bonferroni < 0.05) than zero. Any predictors that do not have an importance significantly greater than zero are removed from the model. We then reduce co-linearity among predictors and remove redundant variables by eliminating any predictor if it is highly correlated (|r| > 0.8) with any variable that had a higher variable importance score.

```{r }
data_df<- raster::extract(x = predictor_vars, y = filter(TV_set, Set=="Training"), method="simple", factors=TRUE, sp=TRUE)@data
data_df$Substrate<- factor(data_df$Substrate, levels=c("Rock", "Sand"))
sample_fraction<- data_df %>% count(Substrate) %>% mutate(prop=n/sum(n)) %>% pull(prop) %>% min() #Downsample to minority class
data_df<- data_df %>% select(-c(Transect, Vid, Sec, timestamp, Relief, Frame_num, Set)) %>% na.omit() 

set.seed(4)
Boruta_obj<- Boruta(Substrate ~ . , data=data_df, pValue = .05, mcAdj = TRUE, maxRuns = 5000, getImp= getImpRfRaw, sample.fraction=c(sample_fraction,sample_fraction))
Boruta_obj<- TentativeRoughFix(x = Boruta_obj)
Predictor_Decisions<-data.frame(Decision = Boruta_obj[["finalDecision"]])
Predictor_Decisions<- Predictor_Decisions %>% mutate(Variable= rownames(Predictor_Decisions)) %>% select(Variable, Decision)
Boruta_Imp<- as.data.frame(Boruta_obj[["ImpHistory"]]) %>% summarize(across(everything(), .fns=mean)) %>% pivot_longer(cols = everything(), names_to = "Variable", values_to= "Importance")
sup_retained_predictors_df<- Predictor_Decisions %>% filter(Decision=="Confirmed") %>% left_join(Boruta_Imp, by="Variable") %>% arrange(desc(Importance))

sup_retained_predictors<- predictor_vars[[which(names(predictor_vars) %in% sup_retained_predictors_df$Variable)]]
corr_vals<- layerStats(sup_retained_predictors, 'pearson', na.rm=TRUE)
corr_vals<- as.data.frame(corr_vals$`pearson correlation coefficient`) %>% mutate(V1=row.names(corr_vals$`pearson correlation coefficient`))
corr_vals<- corr_vals %>% pivot_longer(cols= -V1, names_to = "V2", values_to="correlation")
corr_vals<- corr_vals %>% filter(V1!=V2)
sup_retained_predictors_df<- sup_retained_predictors_df %>% mutate(Keep=TRUE)
for (i in nrow(sup_retained_predictors_df):2) {
  curr_var<- sup_retained_predictors_df$Variable[i]
  more_important_predictors<- sup_retained_predictors_df[1:(i-1), "Variable"]
  max_corr<- corr_vals %>% filter((V1==curr_var) & (V2 %in% more_important_predictors)) %>% pull(correlation) %>% abs() %>% max()
  if(max_corr > 0.8){sup_retained_predictors_df$Keep[i]<- FALSE}
  }
sup_retained_predictors_df<- sup_retained_predictors_df %>% filter(Keep==TRUE)
sup_retained_predictors<- sup_retained_predictors[[which(names(sup_retained_predictors) %in% sup_retained_predictors_df$Variable)]]
```

```{r include=FALSE}
writeRaster(sup_retained_predictors, "Data/derived/Multibeam/retained_predictors.grd",overwrite=TRUE)
sup_retained_predictors<- brick("Data/derived/Multibeam/retained_predictors.grd")
```

### Plot of Retained Variables

Figure S1.5

```{r echo=FALSE, fig.height=10, fig.width=8, message=FALSE}
sup_retained_var_plot_list<- vector(mode="list", length = nlayers(sup_retained_predictors))
sup_retained_predictors_stars<- st_as_stars(sup_retained_predictors, ignore_file=TRUE)
for (i in 1:length(sup_retained_var_plot_list)) {
  curr_var<- names(sup_retained_predictors)[i]
  if (grepl(pattern = "(^northness)|(^eastness)|(correlation)", curr_var)) {
    curr_pal<- colorRampPalette(c("blue", "gray", "red"))(100)
    breaks<- c(-1,0,1)
    midpoint<- 0
  } else if(grepl(pattern = "(^tpi)", curr_var)) {
    curr_pal<- colorRampPalette(c("blue", "gray", "red"))(100)
    breaks<- c(floor(minValue(sup_retained_predictors[[i]])), 0, ceiling(maxValue(sup_retained_predictors[[i]])))
    midpoint<- 0
    } else{
    curr_pal<- pal1
    breaks<- NULL
    midpoint<- NULL
  }
  curr_stars<- sup_retained_predictors_stars[,,,i]
  sup_retained_var_plot_list[[i]]<- tm_shape(curr_stars, raster.downsample = FALSE) +
  tm_raster(palette = curr_pal, style= "cont", title = "", breaks = breaks, midpoint = midpoint)+
    tm_layout(main.title = curr_var, 
    main.title.position = "center",
    main.title.size=0.75)
}
sup_retained_var_plot<- tmap_arrange(sup_retained_var_plot_list, ncol=4)
sup_retained_var_plot
```

### Fit and run model

```{r message=FALSE, warning=FALSE, results='hide'}
#Fit and predict using random forest
n_retained_pred<- nlayers(sup_retained_predictors)
beginCluster(detectCores(logical = FALSE)-1)
set.seed(5)
sup_model<- superClass(img =sup_retained_predictors, trainData = filter(TV_set, Set=="Training"), responseCol = "Substrate", model = "ranger", mode = "classification", valData = filter(TV_set, Set=="Validation"), predict = FALSE, predType = "prob", kfold=5, minDist = 0, metric= "Kappa", num.trees=3000, tuneGrid = data.frame(mtry=2:(n_retained_pred-1), min.node.size=1, splitrule="gini"), keep.inbag = TRUE, sample.fraction= c(sample_fraction,sample_fraction), scale.permutation.importance = FALSE, importance= "permutation", local.importance = TRUE, verbose = FALSE)
endCluster()

sup_hab<- RStoolbox:::predict.superClass(object = sup_model, img = sup_retained_predictors, predType="raw")
names(sup_hab)<- "Substrate"
sup_hab<- as.factor(sup_hab)
levels(sup_hab)[[1]]$Substrate<- c("Rock", "Sand")

#Uncertainty
sup_hab_prob<- RStoolbox:::predict.superClass(sup_model, sup_retained_predictors, predType = "prob")
names(sup_hab_prob)<- c("Rock", "Sand")
sup_hab_entropy<- mytools::calc_entropy(sup_hab_prob)
names(sup_hab_entropy)<- "Shannon_Entropy"
```

```{r include=FALSE}
writeRaster(sup_hab, filename = "Products/Mapping/Elbow_Substrate_Supervised.tif", overwrite=TRUE)
writeRaster(sup_hab, filename = "Products/Mapping/Elbow_Substrate_Supervised.grd", overwrite=TRUE)
sup_hab<- raster("Products/Mapping/Elbow_Substrate_Supervised.grd")

writeRaster(sup_hab_prob, filename = "Products/Mapping/Elbow_Substrate_Supervised_Prob.grd", overwrite=TRUE)
sup_hab_prob<- brick("Products/Mapping/Elbow_Substrate_Supervised_Prob.grd")

writeRaster(sup_hab_entropy, filename = "Products/Mapping/Elbow_Substrate_Supervised_Entropy.tif", overwrite=TRUE)
writeRaster(sup_hab_entropy, filename = "Products/Mapping/Elbow_Substrate_Supervised_Entropy.grd", overwrite=TRUE)
sup_hab_entropy<- raster("Products/Mapping/Elbow_Substrate_Supervised_Entropy.grd")
```

### Model Diagnostics
Model diagnostics showing that the overall Out-Of-Bag error rate plateaus, meaning that a sufficient number of decision trees have been used in the model

Figure S1.6

```{r echo=FALSE}
OOB_df<- extract_ranger_oob(sup_model)
err_vs_ntree<- ggplot(data = OOB_df, aes(x=ntrees, y=Overall))+
  geom_line()+ylab("Error")+ggtitle("Out-Of-Bag Error vs Number of Trees")+xlab("Number of Trees")+
  scale_y_continuous(labels=scales::percent_format())+
  theme(plot.title = element_text(hjust=.5))
```
```{r echo=FALSE, , fig.height=4, fig.width=6}
err_vs_ntree
```


### Resulting Maps

The left panel shows the predicted substrate across the study area based on our supervised classification model. In addition to the predictions, the random forest model allows for an entropy map which plots the uncertainty associated with the classification for each pixel by quantifying the level of disagreement among the all the different decision trees in our random forest model.

Figure S1.7

```{r echo=FALSE, fig.width=8, fig.height=8}
sup_sub_plot<- tm_shape(st_as_stars(sup_hab, ignore_file=TRUE), raster.downsample = FALSE)+
  tm_raster(col="Substrate", palette = c("red" , "#FFEBBE"), title = "Supervised Substrate", style = "cat")+
  tm_credits(text = "a)", position = c("right", "top"), size = 1)+
   tm_graticules(lines=FALSE)

ent_pal<- colorRampPalette(c("blue", "yellow", "red"))
sup_ent_plot<- tm_shape(sup_hab_entropy, raster.downsample = FALSE)+
  tm_raster(palette=ent_pal(100), style = "cont", title = "Shannon Entropy")+
    tm_credits(text = "b)", position = c("right", "top"), size = 1)+
   tm_graticules(lines=FALSE)#Entropy Map to look at uncertainty

tmap_arrange(sup_sub_plot, sup_ent_plot)
```


```{r echo=FALSE}
sup_area<- as.data.frame(freq(sup_hab, useNA="no", merge=TRUE))
sup_area$Substrate=c("Rock", "Sand")
sup_area<- sup_area %>% mutate(Area_m2= count*(res(all_of(sup_hab))[1])*(res(all_of(sup_hab))[2]))
sup_area<- sup_area %>% mutate(Area_km2=Area_m2/1e6)
sup_area %>% select(Substrate, Area_km2) %>% mutate(Area_km2=round(Area_km2,2))
```


### Variable Importance

Figure S1.8
```{r echo=FALSE, fig.width=6, fig.height=4}
var_imp<- as.data.frame(sup_model$model$finalModel$variable.importance)
names(var_imp)<- "MDA"
var_imp$Predictor<- rownames(var_imp)
var_imp<- var_imp %>% arrange(desc(MDA))

var_imp_plot<- ggplot(data = var_imp, mapping = aes(x=reorder(Predictor, MDA), y= MDA)) +
  geom_bar(stat="identity")+
  xlab("Variable") +
  ylab("Mean Decrease in Accuracy")+
  ggtitle("Variable Importance")+
  theme(plot.title = element_text(hjust=.5))+
  scale_y_continuous(labels = scales::percent, expand=c(0,0))+
  coord_flip()
var_imp_plot
```


### Acuraccy Assessment

Accuracy assessment was performed by comparing the predictions to the observed ground-truth habitat observations on the transect that was reserved for validation and not used to fit the model.

```{r echo=FALSE}
print(sup_model$validation$performance$table)
agreement_levels<- data.frame(min_k=c(-Inf, 0, 0.2, .4, .6, .8), max_k=c(0,.2,.4,.6,.8,1), Agreement=c("poor", "slight", "fair", "moderate", "substantial", "almost perfect"))
```
Overall Accuracy = `r round(sup_model$validation$performance$overall[["Accuracy"]] *100,1)`% 

Kappa = `r round(sup_model$validation$performance$overall[["Kappa"]],2)`

A Kappa score of greater than `r agreement_levels$min_k[max(which(sup_model$validation$performance$overall[["Kappa"]]>agreement_levels$min_k))]` indicates `r agreement_levels$Agreement[max(which(sup_model$validation$performance$overall[["Kappa"]]>agreement_levels$min_k))]` agreement between predictions and observations.

## Unsupervied Classification

### Principal Components Analysis

First, we conduct a Principal Component Analysis on Z-score normalized variables (spca=TRUE) to remove co-linearity.

```{r }
PCA_layers<- rasterPCA(predictor_vars, spca = TRUE)
```

```{r include=FALSE}
writeRaster(PCA_layers$map, "Data/derived/Multibeam/PCA_map.grd", overwrite=TRUE)
PCA_layers$map<- brick("Data/derived/Multibeam/PCA_map.grd")
```

```{r include=FALSE}
PCA_comp_df<- tibble(PC=1:nlayers(PCA_layers$map), Observed=PCA_layers$model$sdev^2, Simulated=broken_stick(nlayers(PCA_layers$map))) #dataframe comparing eigenvalues to those of random data
n_pcs<- (which(PCA_comp_df$Simulated>PCA_comp_df$Observed & PCA_comp_df$PC>1)[1])-1
names(n_pcs)<- NULL
```

As each sequential principal component explains a smaller percentage of the variation in the data, next we must determine how many principal components to retain. We will retain only a subset that are determined to be important. There are several methods used to identify the appropriate number of principal components to retain (Jackson, 1993). We compared the variation explained by each Principal Component and the variation expected to be explained by simulated data in which the variance was distributed randomly among the principal components as modeled by a broken-stick distribution (Frontier, 1976). The first `r n_pcs` explained more than could be expected by random chance so the first `r n_pcs` principal components were retained. These first `r n_pcs` principal components explain `r round((100*cumsum(PCA_comp_df$Observed)/sum(PCA_comp_df$Observed))[n_pcs])`% of the variation in the original data.


Figure S1.9

```{r echo=FALSE, warning=FALSE, fig.width=6, fig.height=4}
PCA_var_plot<- ggplot(data = gather(PCA_comp_df, key = "Data", value = "Variance", Observed, Simulated), mapping = aes(x=PC, y = Variance, color=Data))+
  geom_point()+
  geom_line()+
  xlab("Principal Component")+
  ggtitle("Variance Explained vs Principal Component Number")+
  theme(plot.title = element_text(hjust = 0.5))+
  scale_x_continuous(breaks = seq(from = 0, to = max(PCA_comp_df$PC), by=10),limits = c(1,NA))
PCA_var_plot
```

Plot of the retained principal components
Figure S1.10

```{r echo=FALSE, fig.height=10, fig.width=8}
ret_PCA_plot<- tm_shape(st_as_stars(PCA_layers$map[[1:n_pcs]], ignore_file=TRUE), raster.downsample = FALSE)+
  tm_raster(palette = pal1, style= "cont", title = "", midpoint = NA) +
  tm_facets(free.scales = TRUE, ncol = 3)
ret_PCA_plot
```

### Run clustering on the retained principal components

```{r }
beginCluster(detectCores(logical = FALSE)-1)
set.seed(5)
unsup<- unsuperClass(img =PCA_layers$map[[1:n_pcs]], nClasses = 4, nstarts=100, nIter = 1000, clusterMap = FALSE, algorithm = "MacQueen")
endCluster()
names(unsup$map)<- "Acoustic_Cluster"
unsup$map<- as.factor(unsup$map)
levels(unsup$map)[[1]]$Cluster<- c("1", "2", "3", "4")
```

```{r include=FALSE}
writeRaster(unsup$map, filename = "Products/Mapping/Elbow_AcousticClusters_Unsupervised.tif", overwrite=TRUE)
writeRaster(unsup$map, filename = "Products/Mapping/Elbow_AcousticClusters_Unsupervised.grd", overwrite=TRUE)
unsup$map<- raster("Products/Mapping/Elbow_AcousticClusters_Unsupervised.grd")
```

### Interpret Acoustic clusters

Using the ground-truth data points, the substrate observations within each cluster were counted up and each cluster was interpreted as the substrate that occured most often within it. The firt panel shows the cluster number, and the right panel shows the classified map based on majority vote of the ground-truth habitat observations in each cluster (see table below).

```{r echo=TRUE}
acoustic_class<- raster::extract(x = unsup$map, y = filter(TV_set, Set=="Training"), method="simple", factors=FALSE, sp=TRUE)@data
acoustic_class<- acoustic_class %>% select(Substrate, Acoustic_Cluster) %>% na.omit()
votes<- acoustic_class %>% group_by(Acoustic_Cluster) %>% count(Substrate) %>% ungroup()
votes_table<- votes %>% pivot_wider(names_from = Substrate, values_from=n, values_fill=0)
votes_table<- votes_table %>% mutate(Assigned_Class = ifelse(test = Sand>Rock, yes = "Sand", no = "Rock"))
votes_table
```

Figure S1.11

```{r echo=FALSE, fig.width=8, fig.height=8}
q_rules<- votes_table %>% mutate(Assigned_Class= ifelse(test = Assigned_Class=="Rock", yes = 1, no = 2)) %>% select(Acoustic_Cluster, Assigned_Class) %>% as.matrix()

unsup_hab<- raster::reclassify(unsup$map, rcl= q_rules)
names(unsup_hab)<- "Substrate"
unsup_hab<- as.factor(unsup_hab)
levels(unsup_hab)[[1]]$Substrate<- c("Rock", "Sand")

unsup_cluster_plot<- tm_shape(st_as_stars(unsup$map, ignore_file=TRUE), raster.downsample = FALSE)+
  tm_raster(palette = c("blue","green", "purple", "orange"), breaks = 1:5)+
  tm_graticules(lines=FALSE)+
  tm_credits(text = "a)", position = c("right", "top"), size = 1)

unsup_sub_plot<- tm_shape(unsup_hab, raster.downsample = FALSE)+
  tm_raster(palette = c("red","#FFEBBE"))+
  tm_graticules(lines=FALSE)+
  tm_credits(text = "b)", position = c("right", "top"), size = 1)


tmap_arrange(unsup_cluster_plot,unsup_sub_plot)
```

```{r include=FALSE}
writeRaster(unsup_hab, filename = "Products/Mapping/Elbow_Substrate_Unsupervised.tif", overwrite=TRUE)
writeRaster(unsup_hab, filename = "Products/Mapping/Elbow_Substrate_Unsupervised.grd", overwrite=TRUE)
unsup_hab<- raster("Products/Mapping/Elbow_Substrate_Unsupervised.grd")
```

```{r echo=FALSE}
unsup_area<- as.data.frame(freq(unsup_hab, useNA="no", merge=TRUE))
unsup_area$Substrate=c("Rock", "Sand")
unsup_area<- unsup_area %>% mutate(Area_m2= count*(res(all_of(unsup_hab))[1])*(res(all_of(unsup_hab))[2]))
unsup_area<- unsup_area %>% mutate(Area_km2=Area_m2/1e6)
unsup_area %>% select(Substrate, Area_km2) %>% mutate(Area_km2=round(Area_km2,2))
```

### Acuraccy Assessment

Accuracy assessment was performed by comparing the predictions to the observed ground-truth habitat observations on the transect that was reserved for validation and not used to fit the model.

```{r echo=TRUE}
unsup_val<- validateMap(map = unsup_hab, valData = filter(TV_set, Set=="Validation"), responseCol = "Substrate", mode="classification", classMapping=data.frame(class=c("Rock", "Sand"), classID= c(1,2)))
```

```{r echo=FALSE}
print(unsup_val$performance$table)
```
Overall Accuracy = `r round(unsup_val$performance$overall[["Accuracy"]] *100,1)`% 

Kappa = `r round(unsup_val$performance$overall[["Kappa"]],2)`

A Kappa score of greater than `r agreement_levels$min_k[max(which(unsup_val$performance$overall[["Kappa"]]>agreement_levels$min_k))]` indicates `r agreement_levels$Agreement[max(which(unsup_val$performance$overall[["Kappa"]]>agreement_levels$min_k))]` agreement between predictions and observations.


## Comparison of Supervised and Unsupervised

### Difference Maps

For areas of disagreement, the legend is formatted as supervised/unsupervised.

```{r echo=FALSE}
diff_map<- (sup_hab*10)+unsup_hab
diff_map[sup_hab==unsup_hab]<- 1
diff_map<- as.factor(diff_map)
names(diff_map)<- "Difference"
levels(diff_map)[[1]]$Difference<- c("Agreement", "Rock/Sand", "Sand/Rock")
```

```{r include=FALSE}
writeRaster(diff_map, filename = "Products/Mapping/difference_map.tif", overwrite=TRUE)
writeRaster(diff_map, filename = "Products/Mapping/difference_map.grd", overwrite=TRUE)
diff_map<- raster("Products/Mapping/difference_map.grd")
```

Figure S1.12  

For areas of disagreement, the legend is formatted as supervised/unsupervised.  

```{r fig.width=4, fig.height=8}
diff_map_plot<- tm_shape(droplevels(st_as_stars(diff_map, ignore_file=TRUE), exclude=NA), raster.downsample = FALSE)+
  tm_raster(palette = c("palegreen", "red","#FFEBBE"), title="", drop.levels = TRUE)+
   tm_graticules(lines=FALSE)
diff_map_plot
```


### Khist and Kloc

```{r include=FALSE}
hab_map_df<- tibble(sup= values(sup_hab), unsup= values(unsup_hab))
hab_map_df<- na.omit(hab_map_df)
hab_map_df$sup<- factor(hab_map_df$sup, levels = c("1", "2"))
hab_map_df$unsup<- factor(hab_map_df$unsup, levels = c("1", "2"))
hab_map_conf_mat<- confusionMatrix(data = hab_map_df$unsup, reference = hab_map_df$sup)$table

PA<- sum(diag(hab_map_conf_mat)/sum(hab_map_conf_mat))
PE<- rep(NA_real_, nrow(hab_map_conf_mat))
PMax<- rep(NA_real_, nrow(hab_map_conf_mat))

for (i in 1:nrow(hab_map_conf_mat)) {
  p_iT<- (rowSums(hab_map_conf_mat)[i])/sum(hab_map_conf_mat)
  p_Ti<- (colSums(hab_map_conf_mat)[i])/sum(hab_map_conf_mat)
  PE[i]<- p_iT*p_Ti
  PMax[i]<- min(c(p_iT,p_Ti))
}
PE<- sum(PE)
Pmax<- sum(PMax)
K<- (PA-PE)/(1-PE)
Kloc<- (PA-PE)/(Pmax-PE)
Khisto<- (Pmax-PE)/(1-PE)
```

In addition to comparing the agreement between ground-truth observations and predictions, Kappa can be used to compare the agreement in predictions between the two maps by looking at the agreement in corresponding classifications for each pixel. Additionally Kappa can be broken down into two components: Klocation and Khisto, where Kappa is defined as the product of Klocation and Khisto (Pontius, 2000; Hagen, 2002; Sousa et al 2002). Klocation is a measure of the similarity in the spatial distribution of the substrate, and Khisto is measure of the similarity in the frequency of class predictions (Pontius, 2000; Hagen, 2002; Sousa et al 2002). The two maps have a Kloc value of `r round(Kloc,3)` indicating `r agreement_levels$Agreement[Kloc > agreement_levels$min_k & Kloc <= agreement_levels$max_k]` agreement in the spatial distribution of classes, and a Khisto of `r round(Khisto,3)` indicating `r agreement_levels$Agreement[Khisto > agreement_levels$min_k & Khisto <= agreement_levels$max_k]` agreement in the frequency of class predictions. Overall this corresponds to a Kappa value of `r round(K,3)` indicating a `r agreement_levels$Agreement[K > agreement_levels$min_k & K <= agreement_levels$max_k]` level of agreement between the two maps.


### Monte-Carlo Permutation Test

```{r include=FALSE}
val_comp<- tibble(ref=pull(filter(TV_set@data, Set=="Validation"), Substrate),
             sup= raster::extract(x = sup_hab, y=filter(TV_set, Set== "Validation"), na.rm=FALSE),
             unsup= raster::extract(x = unsup_hab, y=filter(TV_set, Set== "Validation"), na.rm=FALSE,))
val_comp<- na.omit(val_comp)
val_comp<- val_comp %>% mutate(sup=ifelse(test = sup==1, yes = "Rock", no = "Sand"),
                               unsup=ifelse(test = unsup==1, yes = "Rock", no = "Sand"))
val_comp<- val_comp %>% mutate_all(factor, levels=c("Rock", "Sand"))
Ksup<- confusionMatrix(data=val_comp$sup, reference=val_comp$ref, positive = "Rock")$overall[["Kappa"]]
Kunsup<- confusionMatrix(data=val_comp$unsup, reference=val_comp$ref, positive = "Rock")$overall[["Kappa"]]
Kappa_Diff<- Ksup-Kunsup
# #Monte carlo
iter<- 1000
kappa_perm<- tibble(K1=rep(NA_real_,iter), K2= NA_real_)
kappa_perm$K1[1]<- Ksup
kappa_perm$K2[1]<- Kunsup
set.seed(5)
for (i in 2:iter) {
  ref_perm<- sample(x = val_comp$ref, size = nrow(val_comp), replace = FALSE)
  kappa_perm$K1[i]<- confusionMatrix(data=val_comp$sup, reference=ref_perm, positive = "Rock")$overall[["Kappa"]]
  kappa_perm$K2[i]<- confusionMatrix(data=val_comp$unsup, reference=ref_perm, positive = "Rock")$overall[["Kappa"]]
  }
kappa_perm<-  kappa_perm %>% mutate(kdiff=K1-K2)
p_val<- sum(kappa_perm$kdiff>=Kappa_Diff)/iter
```

A one-tailed Monte-Carlo permutation test with `r iter` iterations on the difference in Kappa values was performed (McKenzie et al, 1996). This results of this test showed that the supervised classification map provided significantly higher agreement with the ground-truth observations as compared to the unsupervised classification map (p <= `r p_val`).

# Overlaying vertical relief on substrate
Because higher relief features relatively rare, and are fairly small relative to the error associated with positioning of the CBASS, we did not directly try to predict this in the supervised and unsupervised models. However, we can measure vertical relief as a difference between the minimum and maximum values. Total depth was calculated from the CBASS sensors as CBASS depth + CBASS altitude (where altitude measurements have been adjusted for the pitch of the system). Vertical relief for each 15 second bin was calculated as the maximum change in depth over that 15 second observation window. A box plot of the assigned visual relief class, and the measured vertical relief from the sensors is shown below. In blue, orange, and red are the relief cutoffs for low, moderate, and high relief respectively as proposed by Smith et al (2011) in their reef fish survey of the Florida Keys. This appears to match up fairly well with our observations, so we will use these cutoffs:
low relief <= 1m ; 1m < moderate relief <= 2m; high relief > 2m.

Figure S1.13

```{r fig.width=6, fig.height=4}
one_hz<- one_hz %>% mutate(Total_Depth= depth+ altitude*cos(radians(pitch))) #Calculate Total depth

one_hz<- one_hz %>% mutate(vertical_relief_15s = NA_real_)
for (i in 8:(nrow(one_hz)-7)) {
  curr_idx<- (i-7):(i+7)
  if(length(unique(one_hz$Transect[curr_idx]))!=1){
    next() #Prevent averaging across different transects
    }
  dvalues<- one_hz$Total_Depth[curr_idx]
  dmin<- min(dvalues, na.rm=TRUE)
  dmax= max(dvalues, na.rm=TRUE)
  vrelief<- dmax-dmin
  if(is.infinite(vrelief)){vrelief<-NA}
  one_hz$vertical_relief_15s[i]<- vrelief
}

hab_relief<- hab %>% left_join(one_hz, by= c("timestamp","Transect")) %>% select(timestamp, Transect, Relief, vertical_relief_15s) %>% filter(Relief!="None" & Relief != "No_Vis")
hab_relief$Relief<- factor(hab_relief$Relief, levels=c("Low_Relief", "Moderate_Relief", "High_Relief"))

visual_relief_plot<- ggplot(data = hab_relief, mapping = aes(x= Relief, y=vertical_relief_15s))+
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 0, ymax = 1, fill= "dodgerblue", alpha=.2)+
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 1, ymax = 2, fill= "orange", alpha=.2)+
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 2, ymax = Inf, fill= "red", alpha=.2)+
  geom_boxplot()+
  scale_y_continuous(breaks = seq(0, 8, by= 1), expand = c(0,0))+
  ylab("Vertical Relief (m) over 15 s")+
  xlab("Relief Class")
visual_relief_plot
```

These thresholds are then used to delineate higher relief areas directly from the bathymetry by calculating the difference between a central pixel and the minimum depth of surrounding pixels using a sliding 3x3 window.

Figure S1.14

```{r warning=FALSE}
w<- matrix(data = 1, nrow = 3,ncol = 3) #3x3 window
w[5]<- NA
min_bathy<- focal(x = bathy, w=w, fun=min, na.rm=TRUE) #Calculate minimum depth around a central pixel
min_bathy[is.infinite(min_bathy)]<- NA #Replace Inf with NA
relief<- bathy-min_bathy #Calculate vertical relief

relief_qrules<- matrix(c(-Inf,1,2, 1, 2, Inf, 1,2,3), nrow = 3,ncol = 3)
relief_classified<- raster::reclassify(relief, relief_qrules, include.lowest=FALSE, right=TRUE)
relief_classified<- as.factor(relief_classified)
levels(relief_classified)[[1]]$Relief_Class<- c("Low","Mod", "High")
```

```{r include= FALSE}
writeRaster(min_bathy, "Data/derived/Multibeam/min_bathy.grd", overwrite=TRUE)
min_bathy<- raster("Data/derived/Multibeam/min_bathy.grd")
writeRaster(relief, "Data/derived/Multibeam/relief.grd", overwrite=TRUE)
relief<- raster("Data/derived/Multibeam/relief.grd")
writeRaster(relief_classified, "Data/derived/Multibeam/relief_classified.grd",overwrite=TRUE)
relief_classified<- raster("Data/derived/Multibeam/relief_classified.grd")
```

```{r fig.width=8, fig.height=8}
Relief_Map_Plot<- tm_shape(relief, raster.downsample = FALSE)+
  tm_raster(palette=pal1, midpoint = NA, style="cont", title = "Relief (m)")+
   tm_graticules(lines=FALSE)
Relief_Classified_Plot<- tm_shape(st_as_stars(relief_classified, ignore_file=TRUE), raster.downsample = FALSE)+
  tm_raster(palette=c("gray", "yellow", "red"), title = "Relief Class")+
   tm_graticules(lines=FALSE)
tmap_arrange(Relief_Map_Plot, Relief_Classified_Plot)
```


Figure S1.15
```{r }
sub_relief<- (sup_hab*10)+relief_classified
sub_relief[sub_relief==22]<-21
sub_relief[sub_relief==23]<-21 #Collapse sand to one category
sub_relief<- as.factor(sub_relief)
names(sub_relief)<- "Habitat"
levels(sub_relief)[[1]]$habitat<-  c("LR Rock", "MR Rock", "HR Rock", "Sand")
```

```{r include=FALSE}
writeRaster(sub_relief, filename = "Products/Mapping/Elbow_Supervised_HabRelief.tif", overwrite=TRUE)
writeRaster(sub_relief, filename = "Products/Mapping/Elbow_Supervised_HabRelief.grd", overwrite=TRUE)
sub_relief<- raster("Products/Mapping/Elbow_Supervised_HabRelief.grd")
```

```{r}
sub_relief_plot<- tm_shape(droplevels(st_as_stars(sub_relief, ignore_file=TRUE), exclude=NA), raster.downsample = FALSE)+
  tm_raster(palette = c("dodgerblue", "orange", "red", "#FFEBBE"))+
   tm_graticules(lines=FALSE)
sub_relief_plot
```


```{r echo=FALSE}
hab_area<- as.data.frame(freq(sub_relief, useNA="no", merge=TRUE))
hab_area$habitat<- c("LR_Rock", "MR_Rock", "HR_Rock", "Sand")
hab_area<- hab_area %>% mutate(Area_m2= count*(res(all_of(sub_relief))[1])*(res(all_of(sub_relief))[2]))
hab_area<- hab_area %>% mutate(Area_km2=Area_m2/1e6)
hab_area<- hab_area %>% mutate(Area_percent=100*Area_km2/sum(Area_km2))
hab_area %>% select(habitat, Area_km2, Area_percent) %>% mutate(across(where(is.numeric), .fns=round, 2))
```

```{r echo=FALSE}
save.image(file = 'Data/R_environment/Elbow_R_Mapping_Environment.RData')
```

```{r include=FALSE}
tmap_save(tm=gtruth_plot,
          filename = "Products/figures/gtruth_plot.tiff",
          width=8,
          height = 8,
          units="in",
          dpi=300)

tmap_save(tm=sup_retained_var_plot,
          filename = "Products/figures/sup_retained_var_plot.tiff",
          width=7,
          height = 8,
          units="in",
          dpi=300)

tmap_save(tm= tmap_arrange(sup_sub_plot, sup_ent_plot),
          filename = "Products/figures/SupSubEnt.tiff",
          width=8,
          height = 8,
          units="in",
          dpi=300)

ggsave(filename = "var_imp_plot.tiff",
       plot = var_imp_plot+theme(text = element_text(size = 7)), 
       device = "tiff", 
       path = "Products/figures", 
       width = 6, 
       height = 4, 
       units = "in", 
       dpi = 300)

ggsave(filename = "err_vs_ntree.tiff",
       plot = err_vs_ntree+theme(text = element_text(size = 7)), 
       device = "tiff", 
       path = "Products/figures", 
       width = 6, 
       height = 4, 
       units = "in", 
       dpi = 300)

ggsave(filename = "PCA_var_plot.tiff",
       plot = PCA_var_plot+theme(text = element_text(size = 7))  , 
       device = "tiff", 
       path = "Products/figures", 
       width = 6, 
       height = 4, 
       units = "in", 
       dpi = 300)

tmap_save(tm=ret_PCA_plot,
          filename = "Products/figures/ret_PCA_plot.tiff",
          width=6,
          height = 8,
          units="in",
          dpi=300)

tmap_save(tm=tmap_arrange(unsup_cluster_plot,unsup_sub_plot),
          filename = "Products/figures/clust_unsup__plot.tiff",
          width=8,
          height = 8,
          units="in",
          dpi=300)

tmap_save(tm=diff_map_plot,
          filename = "Products/figures/diff_map_plot.tiff",
          width=6,
          height = 8,
          units="in",
          dpi=300)

tmap_save(tm=tmap_arrange(Relief_Map_Plot, Relief_Classified_Plot),
          filename = "Products/figures/relief_map_plot.tiff",
          width=8,
          height = 8,
          units="in",
          dpi=300)

tmap_save(tm=sub_relief_plot,
          filename = "Products/figures/sub_relief_plot.tiff",
          width=6,
          height = 8,
          units="in",
          dpi=300)

ggsave(filename = "visual_relief_plot.tiff",
       plot = visual_relief_plot, 
       device = "tiff", 
       path = "Products/figures", 
       width = 6, 
       height = 6, 
       units = "in", 
       dpi = 300)
```

