From 63291504b656e1a02effa33af5d84072ba3328aa Mon Sep 17 00:00:00 2001
From: vwmaus <vwmaus1@gmail.com>
Date: Fri, 8 Sep 2023 00:21:47 +0200
Subject: [PATCH] Adds knn-1-twdtw train and predict methods for stars objects

---
 DESCRIPTION                          |   7 +-
 NAMESPACE                            |  14 +--
 R/create_patterns.R                  | 146 -------------------------
 R/plot_patterns.R                    |  62 +++++------
 R/predict.R                          |  34 ++++++
 R/prepare_time_series.R              |  81 ++++++++++++++
 R/{utils.R => shift_dates.R}         |  20 +---
 R/train.R                            | 154 +++++++++++++++++++++++++++
 R/zzz.R                              |   6 +-
 inst/mato_grosso_brazil/samples.gpkg | Bin 135168 -> 135168 bytes
 man/create_patterns.Rd               |  41 -------
 man/extract_time_series.Rd           |  21 ----
 man/get_stars_time_freq.Rd           |   2 +-
 man/knn1_twdtw.Rd                    |  53 +++++++++
 man/plot.knn1_twdtw.Rd               |  31 ++++++
 man/plot_patterns.Rd                 |  26 -----
 man/predict.knn1_twdtw.Rd            |  24 +++++
 man/prepare_time_series.Rd           |  20 ++++
 man/shift_dates.Rd                   |   2 +-
 tests/testthat/test-twdtw_classify.R |  30 +++---
 20 files changed, 464 insertions(+), 310 deletions(-)
 delete mode 100644 R/create_patterns.R
 create mode 100644 R/predict.R
 create mode 100644 R/prepare_time_series.R
 rename R/{utils.R => shift_dates.R} (76%)
 create mode 100644 R/train.R
 delete mode 100644 man/create_patterns.Rd
 delete mode 100644 man/extract_time_series.Rd
 create mode 100644 man/knn1_twdtw.Rd
 create mode 100644 man/plot.knn1_twdtw.Rd
 delete mode 100644 man/plot_patterns.Rd
 create mode 100644 man/predict.knn1_twdtw.Rd
 create mode 100644 man/prepare_time_series.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 4e08634..51051a4 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -47,11 +47,10 @@ Depends:
 Imports:
     mgcv,
     stats,
-    scales,
-    reshape2,
-    rlang
+    tidyr,
+    rlang,
+    proxy
 Suggests:
     rbenchmark,
-    stringr,
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index fb17b02..d91ee48 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,7 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
-export(create_patterns)
-export(plot_patterns)
+S3method(plot,knn1_twdtw)
+S3method(predict,knn1_twdtw)
+export(knn1_twdtw)
 export(shift_dates)
 import(ggplot2)
 import(sf)
@@ -10,11 +11,12 @@ import(twdtw)
 importFrom(mgcv,gam)
 importFrom(mgcv,predict.gam)
 importFrom(mgcv,s)
-importFrom(reshape2,melt)
+importFrom(proxy,dist)
 importFrom(rlang,.data)
-importFrom(scales,date_format)
-importFrom(scales,percent)
-importFrom(scales,pretty_breaks)
 importFrom(stats,as.formula)
 importFrom(stats,predict)
 importFrom(stats,setNames)
+importFrom(tidyr,nest)
+importFrom(tidyr,pivot_longer)
+importFrom(tidyr,pivot_wider)
+importFrom(tidyr,unnest)
diff --git a/R/create_patterns.R b/R/create_patterns.R
deleted file mode 100644
index 856d1dd..0000000
--- a/R/create_patterns.R
+++ /dev/null
@@ -1,146 +0,0 @@
-#' Create a Pattern Using GAM
-#'
-#' This function creates a pattern based on Generalized Additive Models (GAM).
-#' It uses the specified formula to fit the model and predict values.
-#'
-#' @param x A three dimensions stars object (x, y, time) with the satellite image time series.
-#' @param y An sf object with the coordinates of the training points.
-#' @param formula A formula for the GAM. Default is \code{band ~ \link[mgcv]{s}(time)}.
-#' @param start_column Name of the column in y that indicates the start date. Default is 'start_date'.
-#' @param end_column Name of the column in y that indicates the end date. Default is 'end_date'.
-#' @param label_colum Name of the column in y that contains land use labels. Default is 'label'.
-#' @param sampling_freq The time frequency for sampling including unit, e.g '16 day'. If NULL, the function will infer it.
-#' @param ... Additional arguments passed to the GAM function.
-#'
-#' @return A list containing the predicted values for each label.
-#'
-#' 
-#'
-#' @export
-create_patterns = function(x, y, formula = band ~ s(time), start_column = 'start_date', 
-                          end_column = 'end_date', label_colum = 'label', 
-                          sampling_freq = NULL, ...){
-  
-  # Check if x is a stars object with a time dimension
-  if (!inherits(x, "stars") || dim(x)['time'] < 1) {
-    stop("x must be a stars object with a 'time' dimension")
-  }
-
-  # Check if y is an sf object with point geometry
-  if (!inherits(y, "sf") || !all(st_is(y, "POINT"))) {
-    stop("y must be an sf object with point geometry")
-  }
-
-  # Check for required columns in y
-  required_columns <- c(start_column, end_column, label_colum)
-  missing_columns <- setdiff(required_columns, names(y))
-  if (length(missing_columns) > 0) {
-    stop(paste("Missing required columns in y:", paste(missing_columns, collapse = ", ")))
-  }
-
-  # Check if formula has two 
-  if(length(all.vars(formula)) != 2) {
-    stop("The formula should have only one predictor!")
-  }
-
-  # Convert columns to date-time
-  y[ , start_column] <- to_date_time(y[[start_column]])
-  y[ , end_column] <- to_date_time(y[[end_column]])
-
-  # Extract time series from stars
-  y_ts <- extract_time_series(x, y)
-  y_ts$geom <- NULL
-
-  # Shift dates
-  unique_ids <- unique(y_ts$id)
-  for (id in seq_along(unique_ids)) {
-    idx <- y_ts$id == unique_ids[id]
-    y_ts[idx, 'time'] <- shift_dates(y_ts[idx, 'time'])
-  }
-
-  # Split data frame by label and remove label column
-  y_ts <- lapply(split(y_ts, y_ts$label), function(df) {
-    df$label <- NULL
-    return(df)
-  })
-
-  # Determine sampling frequency
-  if (is.null(sampling_freq)) {
-    sampling_freq <- get_stars_time_freq(x)
-    cat("Sampling frequency inferred from the stars object:", as.numeric(sampling_freq), attr(sampling_freq, "units"), "\n")
-  }
-
-  # Define GAM function
-  gam_fun <- function(y, t, formula, ...){
-    df <- data.frame(y, t = as.numeric(t))
-    df <- setNames(list(y, as.numeric(t)), all.vars(formula))
-    fit <- mgcv::gam(data = df, formula = formula, ...)
-    pred_t <- setNames(list(as.numeric(seq(min(t), max(t), by = sampling_freq))), all.vars(formula)[2])
-    predict(fit, newdata = pred_t)
-  }
-
-  # Apply GAM function
-  patterns <- lapply(y_ts, function(ts){
-    y_time <- ts$time
-    ts$time <- NULL
-    ts$id <- NULL
-    sapply(as.list(ts), function(y) {
-      gam_fun(y, y_time, formula, ...)
-    })
-  })
-
-  return(patterns)
-
-}
-
-
-
-#' Extract Time Series from a Stars Object for Specified Points
-#'
-#' This function extracts a time series from a stars object for each specified point in the sf object. 
-#' Each extracted sample is then labeled with an ID and the label from the sf object.
-#'
-#' @param x A stars object containing the raster time series data.
-#' @param y An sf object containing the point geometries and their associated labels.
-#'
-#' @return A data.frame with the extracted time series for each point in the sf object, 
-#'         with additional columns for the ID and label of each sample.
-#'
-#'
-extract_time_series <- function(x, y) {
-  ts_samples <- st_extract(x, y)
-  ts_samples$id <- 1:dim(ts_samples)["geom"]
-  ts_samples$label <- y$label
-  as.data.frame(ts_samples)
-}
-
-
-
-#' Compute the Most Common Sampling Frequency in a Stars Object
-#'
-#' This function calculates the most common difference between consecutive time points in a stars object. 
-#' This can be useful for determining the sampling frequency of the time series data.
-#'
-#' @param x A stars object containing time series data.
-#'
-#' @return A difftime object representing the most common time difference between consecutive samples.
-#'
-#'
-get_stars_time_freq <- function(x) {
-
-  # Extract the time dimension
-  time_values <- st_get_dimension_values(x, "time")
-
-  # Compute the differences between consecutive time points
-  time_diffs <- diff(time_values)
-
-  # Convert differences to days (while retaining the difftime class)
-  time_diffs <- as.difftime(time_diffs, units = "days")
-
-  # Identify the mode
-  mode_val_index <- which.max(tabulate(match(time_diffs, unique(time_diffs))))
-  freq <- diff(time_values[mode_val_index:(mode_val_index+1)])
-
-  return(freq)
-}
-
diff --git a/R/plot_patterns.R b/R/plot_patterns.R
index dd983b8..bed19f3 100644
--- a/R/plot_patterns.R
+++ b/R/plot_patterns.R
@@ -1,41 +1,43 @@
 #' Plot Patterns from Time Series Data
 #'
-#' This function takes a list of time series data and creates a multi-faceted plot
-#' where each facet corresponds to a different time series from the list. 
-#' Within each facet, different attributes (columns of the time series) are 
-#' plotted as lines with different colors.
+#' This function visualizes time series data patterns from the \code{"knn1_twdtw"} model.
+#' It produces a multi-faceted plot,
+#' where each facet represents a different time series pattern from the model's data.
+#' Within each facet, different attributes (e.g., bands or indices) are plotted as
+#' distinct lines, differentiated by color.
 #'
-#' @param x A list where each element is a data.frame representing a time series.
-#'          Each data.frame should have the same number of rows and columns, 
-#'          with columns representing different attributes (e.g., bands or indices)
-#'          and rows representing time points.
-#'          The name of each element in the list will be used as the facet title.
+#' @param x A model of class \code{"knn1_twdtw"}.
 #' 
-#' @param ... Not used.
+#' @param n An integer specifying the number of patterns to plot. Default is 12.
 #' 
-#' @return A ggplot object displaying the time series patterns.
+#' @param ... Additional arguments. Currently not used.
+#' 
+#' @details
+#' The function is designed for visual inspection of time series patterns from the model. 
+#' Each data frame in the model's data represents a time series. Columns within the data frame 
+#' correspond to different attributes (e.g., bands or indices), while rows represent time points. 
+#' The facet title is derived from the name of each time series in the model's data.
+#'
+#' @return A \code{\link[ggplot2]{ggplot}} object displaying the time series patterns.
 #'
 #' @export
-plot_patterns = function(x, ...) {
-   
+plot.knn1_twdtw <- function(x, n = 12, ...) {
+
   # Convert the list of time series data into a long-format data.frame
-  df.p = do.call("rbind", lapply(names(x), function(p) {
-    ts = x[[p]]
-    # Create a new data.frame with a 'Time' column and a 'Pattern' column
-    # representing the name of the current time series (facet name).
-    data.frame(Time = 1:nrow(ts), ts, Pattern = p) # Assuming the time series are evenly spaced
-  }))
-  
+  df <- unnest(x$data[1:n, ], cols = .data$observations)
+
   # Melt the data into long format suitable for ggplot2
-  df.p = melt(df.p, id.vars = c("Time", "Pattern"))
-  
+  df <- pivot_longer(df, !c(.data$label, .data$time), names_to = "band", values_to = "value")
+
   # Construct the ggplot
-  gp = ggplot(df.p, aes(x = .data$Time, y = .data$value, colour = .data$variable)) + 
-    geom_line() + 
-    facet_wrap(~Pattern) + 
-    theme(legend.position = "bottom") + 
-    guides(colour = guide_legend(title = "Bands")) + 
-    ylab("Value")
-  
+  gp <- ggplot(df, aes(x = .data$time, y = .data$value, colour = .data$band)) +
+    geom_line() +
+    facet_wrap(~label) +
+    theme(legend.position = "bottom") +
+    guides(colour = guide_legend(title = "Bands")) +
+    ylab("Value") +
+    xlab("Time")
+
   return(gp)
-}
\ No newline at end of file
+
+}
diff --git a/R/predict.R b/R/predict.R
new file mode 100644
index 0000000..1df788a
--- /dev/null
+++ b/R/predict.R
@@ -0,0 +1,34 @@
+#' Predict using the knn1_twdtw model
+#'
+#' This function predicts the classes of new data using the Time Warped Dynamic Time Warping (TWDTW) 
+#' method with a 1-nearest neighbor approach. The prediction is based on the minimum TWDTW distance 
+#' to the known patterns stored in the `knn1_twdtw` model.
+#'
+#' @param object A `knn1_twdtw` model object generated by the `knn1_twdtw` function.
+#' @param newdata A data frame or similar object containing the new observations 
+#' (time series data) to be predicted.
+#' @param ... Additional arguments passed to the \link[twdtw]{twdtw} function.
+#'
+#' @return A vector of predicted classes for the `newdata`.
+#'
+#'
+#' @export
+predict.knn1_twdtw <- function(object, newdata, ...){
+
+
+  # Convert newdata to time series
+  newdata_ts <- prepare_time_series(newdata)
+
+  # Compute TWDTW distances
+  distances <- sapply(object$data$observations, function(pattern){
+    sapply(newdata_ts$observations, function(ts) {
+        proxy::dist(x = as.data.frame(ts), y = as.data.frame(pattern), method = "twdtw", ...)
+    })
+  })
+
+  # Find the nearest neighbor for each observation in newdata
+  nearest_neighbor <- apply(distances, 1, which.min)
+
+  # Return the predicted label for each observation based on the nearest neighbor
+  return(factor(object$data$label[nearest_neighbor]))
+}
diff --git a/R/prepare_time_series.R b/R/prepare_time_series.R
new file mode 100644
index 0000000..d53bba4
--- /dev/null
+++ b/R/prepare_time_series.R
@@ -0,0 +1,81 @@
+#' Prepare a Time Series Tibble from a 2D stars Object with Bands and Time Attributes
+#'
+#' This function reshapes a data frame, which has been converted from a stars object, into a nested wide tibble format.
+#' The stars object conversion often results in columns named in formats like "band.YYYY.MM.DD", "XYYYY.MM.DD.band", or "YYYY.MM.DD.band".
+#'
+#' @param x A data frame derived from a stars object containing time series data in wide format.
+#' The column names should adhere to one of the following formats: "band.YYYY.MM.DD", "XYYYY.MM.DD.band", or "YYYY.MM.DD.band".
+#'
+#' @return A nested tibble in wide format. Each row of the tibble corresponds to a unique 'ts_id' that maintains the order from the original stars object.
+#' The nested structure contains observations (time series) for each 'ts_id', including the 'time' of each observation, and individual bands are presented as separate columns.
+#'
+#'
+prepare_time_series <- function(x) {
+
+  # Remove the 'geom' column if it exists
+  x$geom <- NULL
+  var_names <- names(x)
+  var_names <- var_names[!var_names %in% 'label']
+
+  # Extract date and band information from the column names
+  date_band <- do.call(rbind, lapply(var_names, function(name) {
+
+    # Replace any hyphen with periods for consistent processing
+    name <- gsub("-", "\\.", name)
+
+    # Extract date and band info based on different name patterns
+    if (grepl("^.+\\.\\d{4}\\.\\d{2}\\.\\d{2}$", name)) {
+      date_str <- gsub("^.*?(\\d{4}\\.\\d{2}\\.\\d{2})$", "\\1", name)
+      band_str <- gsub("^(.+)\\.\\d{4}\\.\\d{2}\\.\\d{2}$", "\\1", name)
+    }
+    else if (grepl("^X\\d{4}\\.\\d{2}\\.\\d{2}\\..+$", name)) {
+      date_str <- gsub("^X(\\d{4}\\.\\d{2}\\.\\d{2})\\..+$", "\\1", name)
+      band_str <- gsub("^X\\d{4}\\.\\d{2}\\.\\d{2}\\.(.+)$", "\\1", name)
+    } 
+    else if (grepl("^\\d{4}\\.\\d{2}\\.\\d{2}\\..+$", name)) {
+      date_str <- gsub("^(\\d{4}\\.\\d{2}\\.\\d{2})\\..+$", "\\1", name)
+      band_str <- gsub("^\\d{4}\\.\\d{2}\\.\\d{2}\\.(.+)$", "\\1", name)
+    } 
+    else {
+      stop(paste("Unrecognized format in:", name))
+    }
+
+    # Convert the date string to Date format
+    date <- to_date_time(gsub("\\.", "-", date_str))
+
+    return(data.frame(time = date, band = band_str))
+  }))
+
+  # Construct tiem sereis
+  ns <- nrow(x)
+  x$ts_id <- 1:ns
+  if (!'label' %in% names(x)) {
+    x$label <- NA
+  }
+  x <- pivot_longer(x, !c(.data$ts_id, .data$label), names_to = "band_date", values_to = "value")
+  x$band <- rep(date_band$band, ns)
+  x$time <- rep(date_band$time, ns)
+  x$band_date <- NULL
+  result_df <- pivot_wider(x, id_cols = c(.data$ts_id, .data$label, .data$time), names_from = 'band', values_from = 'value')
+  result_df <- nest(result_df, .by = c(.data$ts_id, .data$label), .key = "observations")
+
+  return(result_df)
+
+}
+
+
+#### TO BE REMOVED: twdtw package will export this fucntion
+to_date_time <- function(x){
+  if (!inherits(x, c("Date", "POSIXt"))) {
+    # check if all strings in the vector include hours, minutes, and seconds
+    if (all(grepl("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", x))) {
+      x <- try(as.POSIXct(x), silent = TRUE)
+    } else {
+      x <- try(as.Date(x), silent = TRUE)
+    }
+    if (inherits(x, "try-error")) {
+      stop("Some elements of x could not be converted to a date or datetime format")
+    }
+  }
+  return(x)
+}
diff --git a/R/utils.R b/R/shift_dates.R
similarity index 76%
rename from R/utils.R
rename to R/shift_dates.R
index b9bb556..a37e060 100644
--- a/R/utils.R
+++ b/R/shift_dates.R
@@ -43,22 +43,8 @@ shift_dates <- function(x, origin = "1970-01-01") {
   return(shifted_dates)
 }
 
-#### TO BE REMOVED: twdtw package will export this fucntion 
-to_date_time <- function(x){
-  if (!inherits(x, c("Date", "POSIXt"))) {
-    # check if all strings in the vector include hours, minutes, and seconds
-    if (all(grepl("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", x))) {
-      x <- try(as.POSIXct(x), silent = TRUE)
-    } else {
-      x <- try(as.Date(x), silent = TRUE)
-    }
-    if (inherits(x, "try-error")) {
-      stop("Some elements of x could not be converted to a date or datetime format")
-    }
-  }
+
+shift_ts_dates <- function(x) {
+  x$time <- shift_dates(x$time)
   return(x)
 }
-
-
-
-
diff --git a/R/train.R b/R/train.R
new file mode 100644
index 0000000..06026c9
--- /dev/null
+++ b/R/train.R
@@ -0,0 +1,154 @@
+#'
+#' Train a KNN-1 TWDTW model with optional GAM resampling
+#'
+#' This function prepares a KNN-1 model with the Time Warp Dynamic Time Warping (TWDTW) algorithm.
+#' If a formula is provided, the training samples are resampled using Generalized Additive Models (GAM).
+#'
+#' @param x A two-dimensional stars object (x, y) with time and bands as attributes.
+#' @param y An sf object with the coordinates of the training points.
+#' @param formula Either NULL or a formula to reduce samples of the same label using Generalized Additive Models (GAM).
+#' Default is \code{band ~ s(time)}. See details.
+#' @param start_column Name of the column in y that indicates the start date. Default is 'start_date'.
+#' @param end_column Name of the column in y that indicates the end date. Default is 'end_date'.
+#' @param label_colum Name of the column in y containing land use labels. Default is 'label'.
+#' @param sampling_freq The time frequency for sampling, including the unit (e.g., '16 day').
+#' If NULL, the function will infer the frequency. This parameter is only used if a formula is provided.
+#' @param ... Additional arguments passed to the GAM function.
+#'
+#' @details If \code{formula} is NULL, the KNN-1 model will retain all training samples. If a formula is passed (e.g., \code{band ~ \link[mgcv]{s}(time)}),
+#' then samples of the same label (land cover class) will be resampled using GAM.
+#' Resampling can significantly reduce prediction processing time.
+#'
+#' @return A 'knn1_twdtw' model containing the trained model information and the data used.
+#'
+#' @examples
+#'
+#' # Example will be added once the function is properly implemented and tested.
+#' 
+#' @export
+knn1_twdtw <- function(x, y, formula = NULL, start_column = 'start_date',
+                       end_column = 'end_date', label_colum = 'label',
+                       sampling_freq = NULL, ...){
+
+  # Check if x is a stars object with a time dimension
+  if (!inherits(x, "stars") || length(dim(x)) != 2) {
+    stop("x must be a stars object with two dimensions")
+  }
+
+  # Check if y is an sf object with point geometry
+  if (!inherits(y, "sf") || !all(st_is(y, "POINT"))) {
+    stop("y must be an sf object with point geometry")
+  }
+
+  # Check for required columns in y
+  required_columns <- c(start_column, end_column, label_colum)
+  missing_columns <- setdiff(required_columns, names(y))
+  if (length(missing_columns) > 0) {
+    stop(paste("Missing required columns in y:", paste(missing_columns, collapse = ", ")))
+  }
+
+  # Convert columns to date-time
+  y[, start_column] <- to_date_time(y[[start_column]])
+  y[, end_column] <- to_date_time(y[[end_column]])
+
+  # Prepare time series samples from stars object
+  ts_data <- st_extract(x, y)
+  ts_data$label <- y[[label_colum]]
+  ts_data <- prepare_time_series(as.data.frame(ts_data))
+  ts_data$ts_id <- NULL
+
+  if(!is.null(formula)) {
+
+    # Check if formula has two
+    if(length(all.vars(formula)) != 2) {
+      stop("The formula should have only one predictor!")
+    }
+
+    # Determine sampling frequency
+    if (is.null(sampling_freq)) {
+      sampling_freq <- get_time_series_freq(ts_data)
+    }
+
+    # Shift dates
+    ts_data$observations <- lapply(ts_data$observations, shift_ts_dates)
+
+    # Split data frame by label
+    ts_data <- unnest(ts_data, cols = .data$observations)
+    ts_data <- nest(ts_data, .by = .data$label, .key = "observations")
+
+    # Define GAM function
+    gam_fun <- function(band, t, pred_t, formula, ...){
+      df <- setNames(list(band, as.numeric(t)), all.vars(formula))
+      pred_t[[all.vars(formula)[2]]] <- as.numeric(pred_t[[all.vars(formula)[2]]])
+      fit <- mgcv::gam(data = df, formula = formula, ...)
+      predict(fit, newdata = pred_t)
+    }
+
+    # Apply GAM function
+    ts_data$observations <- lapply(ts_data$observations, function(ts){
+      y_time <- ts$time
+      ts$time <- NULL
+      pred_time <- setNames(list(seq(min(y_time), max(y_time), by = sampling_freq)), all.vars(formula)[2])
+      cbind(pred_time, as.data.frame(sapply(as.list(ts), function(band) {
+        gam_fun(band, y_time, pred_time, formula, ...)
+      })))
+    })
+
+  }
+
+  model <- list()
+  model$call <- match.call()
+  model$formula <- formula
+  model$data <- ts_data
+  class(model) <- "knn1_twdtw"
+
+  return(model)
+
+}
+
+#' Compute the Most Common Sampling Frequency in a Stars Object
+#'
+#' This function calculates the most common difference between consecutive time points in a stars object. 
+#' This can be useful for determining the sampling frequency of the time series data.
+#'
+#' @param x A stars object containing time series data.
+#'
+#' @return A difftime object representing the most common time difference between consecutive samples.
+#'
+#'
+get_stars_time_freq <- function(x) {
+
+  # Extract the time dimension
+  time_values <- st_get_dimension_values(x, "time")
+
+  # Compute the differences between consecutive time points
+  time_diffs <- diff(time_values)
+
+  # Convert differences to days (while retaining the difftime class)
+  time_diffs <- as.difftime(time_diffs, units = "days")
+
+  # Identify the mode
+  mode_val_index <- which.max(tabulate(match(time_diffs, unique(time_diffs))))
+  freq <- diff(time_values[mode_val_index:(mode_val_index+1)])
+
+  return(freq)
+}
+
+get_time_series_freq <- function(x) {
+
+  # Extract the time dimension
+  time_values <- unlist(lapply(x$observations, function(ts) ts$time))
+
+  # Compute the differences between consecutive time points
+  time_diffs <- unlist(lapply(x$observations, function(ts) diff(ts$time)))
+
+  # Convert differences to days (while retaining the difftime class)
+  time_diffs <- as.difftime(time_diffs, units = "days")
+
+  # Identify the mode
+  mode_val_index <- which.max(tabulate(match(time_diffs, unique(time_diffs))))
+  freq <- diff(time_values[mode_val_index:(mode_val_index+1)])
+
+  return(freq)
+
+}
\ No newline at end of file
diff --git a/R/zzz.R b/R/zzz.R
index 499fd6c..9e28f89 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -3,9 +3,9 @@
 #' @import stars
 #' @import ggplot2
 #' @importFrom stats as.formula predict setNames
-#' @importFrom mgcv gam s predict.gam 
-#' @importFrom scales pretty_breaks date_format percent
-#' @importFrom reshape2 melt
+#' @importFrom mgcv gam s predict.gam
 #' @importFrom rlang .data
+#' @importFrom tidyr pivot_longer pivot_wider nest unnest
+#' @importFrom proxy dist
 #' 
 NULL
diff --git a/inst/mato_grosso_brazil/samples.gpkg b/inst/mato_grosso_brazil/samples.gpkg
index 2fa2e37805a0d741c78f794137fefcd1df3bf420..888cbee560a06f5172e88295c069a08a9af749a4 100644
GIT binary patch
delta 25
gcmZozz|pXPV}djz??f4AM&8DR)&$0_2}}$40cIx$l>h($

delta 25
gcmZozz|pXPV}djz_e2?IM()Oh)&$0_2}}$40cG(AkpKVy

diff --git a/man/create_patterns.Rd b/man/create_patterns.Rd
deleted file mode 100644
index 0ff9bd5..0000000
--- a/man/create_patterns.Rd
+++ /dev/null
@@ -1,41 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_patterns.R
-\name{create_patterns}
-\alias{create_patterns}
-\title{Create a Pattern Using GAM}
-\usage{
-create_patterns(
-  x,
-  y,
-  formula = band ~ s(time),
-  start_column = "start_date",
-  end_column = "end_date",
-  label_colum = "label",
-  sampling_freq = NULL,
-  ...
-)
-}
-\arguments{
-\item{x}{A three dimensions stars object (x, y, time) with the satellite image time series.}
-
-\item{y}{An sf object with the coordinates of the training points.}
-
-\item{formula}{A formula for the GAM. Default is \code{band ~ \link[mgcv]{s}(time)}.}
-
-\item{start_column}{Name of the column in y that indicates the start date. Default is 'start_date'.}
-
-\item{end_column}{Name of the column in y that indicates the end date. Default is 'end_date'.}
-
-\item{label_colum}{Name of the column in y that contains land use labels. Default is 'label'.}
-
-\item{sampling_freq}{The time frequency for sampling including unit, e.g '16 day'. If NULL, the function will infer it.}
-
-\item{...}{Additional arguments passed to the GAM function.}
-}
-\value{
-A list containing the predicted values for each label.
-}
-\description{
-This function creates a pattern based on Generalized Additive Models (GAM).
-It uses the specified formula to fit the model and predict values.
-}
diff --git a/man/extract_time_series.Rd b/man/extract_time_series.Rd
deleted file mode 100644
index c4aebe3..0000000
--- a/man/extract_time_series.Rd
+++ /dev/null
@@ -1,21 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_patterns.R
-\name{extract_time_series}
-\alias{extract_time_series}
-\title{Extract Time Series from a Stars Object for Specified Points}
-\usage{
-extract_time_series(x, y)
-}
-\arguments{
-\item{x}{A stars object containing the raster time series data.}
-
-\item{y}{An sf object containing the point geometries and their associated labels.}
-}
-\value{
-A data.frame with the extracted time series for each point in the sf object,
-with additional columns for the ID and label of each sample.
-}
-\description{
-This function extracts a time series from a stars object for each specified point in the sf object.
-Each extracted sample is then labeled with an ID and the label from the sf object.
-}
diff --git a/man/get_stars_time_freq.Rd b/man/get_stars_time_freq.Rd
index 47d90c4..09b3890 100644
--- a/man/get_stars_time_freq.Rd
+++ b/man/get_stars_time_freq.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/create_patterns.R
+% Please edit documentation in R/train.R
 \name{get_stars_time_freq}
 \alias{get_stars_time_freq}
 \title{Compute the Most Common Sampling Frequency in a Stars Object}
diff --git a/man/knn1_twdtw.Rd b/man/knn1_twdtw.Rd
new file mode 100644
index 0000000..02627bb
--- /dev/null
+++ b/man/knn1_twdtw.Rd
@@ -0,0 +1,53 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/train.R
+\name{knn1_twdtw}
+\alias{knn1_twdtw}
+\title{Train a KNN-1 TWDTW model with optional GAM resampling}
+\usage{
+knn1_twdtw(
+  x,
+  y,
+  formula = NULL,
+  start_column = "start_date",
+  end_column = "end_date",
+  label_colum = "label",
+  sampling_freq = NULL,
+  ...
+)
+}
+\arguments{
+\item{x}{A two-dimensional stars object (x, y) with time and bands as attributes.}
+
+\item{y}{An sf object with the coordinates of the training points.}
+
+\item{formula}{Either NULL or a formula to reduce samples of the same label using Generalized Additive Models (GAM).
+Default is \code{band ~ s(time)}. See details.}
+
+\item{start_column}{Name of the column in y that indicates the start date. Default is 'start_date'.}
+
+\item{end_column}{Name of the column in y that indicates the end date. Default is 'end_date'.}
+
+\item{label_colum}{Name of the column in y containing land use labels. Default is 'label'.}
+
+\item{sampling_freq}{The time frequency for sampling, including the unit (e.g., '16 day').
+If NULL, the function will infer the frequency. This parameter is only used if a formula is provided.}
+
+\item{...}{Additional arguments passed to the GAM function.}
+}
+\value{
+A 'knn1_twdtw' model containing the trained model information and the data used.
+}
+\description{
+This function prepares a KNN-1 model with the Time Warp Dynamic Time Warping (TWDTW) algorithm.
+If a formula is provided, the training samples are resampled using Generalized Additive Models (GAM).
+}
+\details{
+If \code{formula} is NULL, the KNN-1 model will retain all training samples. If a formula is passed (e.g., \code{band ~ \link[mgcv]{s}(time)}),
+then samples of the same label (land cover class) will be resampled using GAM.
+Resampling can significantly reduce prediction processing time.
+}
+\examples{
+
+# Example will be added once the function is properly implemented and tested.
+
+}
diff --git a/man/plot.knn1_twdtw.Rd b/man/plot.knn1_twdtw.Rd
new file mode 100644
index 0000000..5f6c297
--- /dev/null
+++ b/man/plot.knn1_twdtw.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/plot_patterns.R
+\name{plot.knn1_twdtw}
+\alias{plot.knn1_twdtw}
+\title{Plot Patterns from Time Series Data}
+\usage{
+\method{plot}{knn1_twdtw}(x, n = 12, ...)
+}
+\arguments{
+\item{x}{A model of class \code{"knn1_twdtw"}.}
+
+\item{n}{An integer specifying the number of patterns to plot. Default is 12.}
+
+\item{...}{Additional arguments. Currently not used.}
+}
+\value{
+A \code{\link[ggplot2]{ggplot}} object displaying the time series patterns.
+}
+\description{
+This function visualizes time series data patterns from the \code{"knn1_twdtw"} model.
+It produces a multi-faceted plot,
+where each facet represents a different time series pattern from the model's data.
+Within each facet, different attributes (e.g., bands or indices) are plotted as
+distinct lines, differentiated by color.
+}
+\details{
+The function is designed for visual inspection of time series patterns from the model.
+Each data frame in the model's data represents a time series. Columns within the data frame
+correspond to different attributes (e.g., bands or indices), while rows represent time points.
+The facet title is derived from the name of each time series in the model's data.
+}
diff --git a/man/plot_patterns.Rd b/man/plot_patterns.Rd
deleted file mode 100644
index 97e24d6..0000000
--- a/man/plot_patterns.Rd
+++ /dev/null
@@ -1,26 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/plot_patterns.R
-\name{plot_patterns}
-\alias{plot_patterns}
-\title{Plot Patterns from Time Series Data}
-\usage{
-plot_patterns(x, ...)
-}
-\arguments{
-\item{x}{A list where each element is a data.frame representing a time series.
-Each data.frame should have the same number of rows and columns,
-with columns representing different attributes (e.g., bands or indices)
-and rows representing time points.
-The name of each element in the list will be used as the facet title.}
-
-\item{...}{Not used.}
-}
-\value{
-A ggplot object displaying the time series patterns.
-}
-\description{
-This function takes a list of time series data and creates a multi-faceted plot
-where each facet corresponds to a different time series from the list.
-Within each facet, different attributes (columns of the time series) are
-plotted as lines with different colors.
-}
diff --git a/man/predict.knn1_twdtw.Rd b/man/predict.knn1_twdtw.Rd
new file mode 100644
index 0000000..605217e
--- /dev/null
+++ b/man/predict.knn1_twdtw.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/predict.R
+\name{predict.knn1_twdtw}
+\alias{predict.knn1_twdtw}
+\title{Predict using the knn1_twdtw model}
+\usage{
+\method{predict}{knn1_twdtw}(object, newdata, ...)
+}
+\arguments{
+\item{object}{A \code{knn1_twdtw} model object generated by the \code{knn1_twdtw} function.}
+
+\item{newdata}{A data frame or similar object containing the new observations
+(time series data) to be predicted.}
+
+\item{...}{Additional arguments passed to the \link[twdtw]{twdtw} function.}
+}
+\value{
+A vector of predicted classes for the \code{newdata}.
+}
+\description{
+This function predicts the classes of new data using the Time Warped Dynamic Time Warping (TWDTW)
+method with a 1-nearest neighbor approach. The prediction is based on the minimum TWDTW distance
+to the known patterns stored in the \code{knn1_twdtw} model.
+}
diff --git a/man/prepare_time_series.Rd b/man/prepare_time_series.Rd
new file mode 100644
index 0000000..dfce70e
--- /dev/null
+++ b/man/prepare_time_series.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/prepare_time_series.R
+\name{prepare_time_series}
+\alias{prepare_time_series}
+\title{Prepare a Time Series Tibble from a 2D stars Object with Bands and Time Attributes}
+\usage{
+prepare_time_series(x)
+}
+\arguments{
+\item{x}{A data frame derived from a stars object containing time series data in wide format.
+The column names should adhere to one of the following formats: "band.YYYY.MM.DD", "XYYYY.MM.DD.band", or "YYYY.MM.DD.band".}
+}
+\value{
+A nested tibble in wide format. Each row of the tibble corresponds to a unique 'ts_id' that maintains the order from the original stars object.
+The nested structure contains observations (time series) for each 'ts_id', including the 'time' of each observation, and individual bands are presented as separate columns.
+}
+\description{
+This function reshapes a data frame, which has been converted from a stars object, into a nested wide tibble format.
+The stars object conversion often results in columns named in formats like "band.YYYY.MM.DD", "XYYYY.MM.DD.band", or "YYYY.MM.DD.band".
+}
diff --git a/man/shift_dates.Rd b/man/shift_dates.Rd
index 87d58c1..9af6b7e 100644
--- a/man/shift_dates.Rd
+++ b/man/shift_dates.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/utils.R
+% Please edit documentation in R/shift_dates.R
 \name{shift_dates}
 \alias{shift_dates}
 \title{Shift Dates to Start on a Specified Origin Year}
diff --git a/tests/testthat/test-twdtw_classify.R b/tests/testthat/test-twdtw_classify.R
index f5c0c04..6beb6a1 100644
--- a/tests/testthat/test-twdtw_classify.R
+++ b/tests/testthat/test-twdtw_classify.R
@@ -1,8 +1,5 @@
-library(stars)
-library(stringr)
-
 # Read training samples
-samples <- st_read(system.file("mato_grosso_brazil/samples.gpkg", package = "dtwSat"))
+samples <- st_read(system.file("mato_grosso_brazil/samples.gpkg", package = "dtwSat"), quiet = TRUE)
 
 # Satellite image time sereis files
 tif_files <- system.file("mato_grosso_brazil", package = "dtwSat") |>
@@ -10,18 +7,23 @@ tif_files <- system.file("mato_grosso_brazil", package = "dtwSat") |>
 
 # The acquisition date is in the file name are not the true acquisition date of each pixel
 # MOD13Q1 is a 16-day composite product, so the acquisition date is the first day of the 16-day period
-acquisition_date <- as.Date(str_extract(tif_files, "[0-9]{8}"), format = "%Y%m%d")
-
-# Read the data as a stars object, setting time as a dimension and band as attribute
-dc <- read_stars(tif_files, proxy = FALSE, along = list(time = acquisition_date)) |>
-  st_set_dimensions(3, c("EVI", "NDVI", "RED", "BLUE", "NIR", "MIR", "DOY")) |>
-  split("band")
+acquisition_date <- as.Date(regmatches(tif_files, regexpr("[0-9]{8}", tif_files)), format = "%Y%m%d")
 
-# Remove the DOY band - this will be supported int the future
-dc <- dc[c("EVI", "NDVI", "RED", "BLUE", "NIR", "MIR")]
+# Read the data as a stars object setting the time/date for each observation using along
+# This will prodcue a 4D array
+# Split 'band' and 'time' dimensions to create a 2D array
+dc <- read_stars(tif_files, proxy = FALSE, along = list(time = acquisition_date), RasterIO = list(bands = 1:6)) |>
+  st_set_dimensions(3, c("EVI", "NDVI", "RED", "BLUE", "NIR", "MIR")) |>
+  split(c("band")) |>
+  split(c("time"))
 
 # Get temporal patters
-ts_patterns <- create_patterns(x = dc, y = samples)
+m <- knn1_twdtw(x = dc, y = samples, formula = band ~ s(time), sampling_freq = 16)
 
 # Visualize patterns
-plot_patterns(ts_patterns)
+plot(m)
+
+# Classify stars
+system.time(lu <- predict(dc, model = m, drop_dimensions = TRUE, cycle_length = 'year', time_scale = 'day', time_weight = c(steepness = 0.1, midpoint = 50)))
+
+# write_stars(lu, "~/Downloads/lu.tif")
\ No newline at end of file