C:40A1p3ea03fe1bb5-started.R
# Build the example input: reshape the first three iris columns to long
# format and split the result into a list.
dt_split <- w2l_split(data = iris, cols2l = 1:3)
# dt_split now holds three data.tables, one each for Sepal.Length,
# Sepal.Width and Petal.Length.

# Example 1: One round of cross-validation (no repetition)
split_cv(
  split_dt = dt_split, # List of split data.tables built above
  v = 3,               # Number of folds
  repeats = 1          # Run the fold assignment a single time
)
#> $Sepal.Length
#> splits id train validate
#> <list> <char> <list> <list>
#> 1: <vfold_split[100x50x150x3]> Fold1 <data.table[100x3]> <data.table[50x3]>
#> 2: <vfold_split[100x50x150x3]> Fold2 <data.table[100x3]> <data.table[50x3]>
#> 3: <vfold_split[100x50x150x3]> Fold3 <data.table[100x3]> <data.table[50x3]>
#>
#> $Sepal.Width
#> splits id train validate
#> <list> <char> <list> <list>
#> 1: <vfold_split[100x50x150x3]> Fold1 <data.table[100x3]> <data.table[50x3]>
#> 2: <vfold_split[100x50x150x3]> Fold2 <data.table[100x3]> <data.table[50x3]>
#> 3: <vfold_split[100x50x150x3]> Fold3 <data.table[100x3]> <data.table[50x3]>
#>
#> $Petal.Length
#> splits id train validate
#> <list> <char> <list> <list>
#> 1: <vfold_split[100x50x150x3]> Fold1 <data.table[100x3]> <data.table[50x3]>
#> 2: <vfold_split[100x50x150x3]> Fold2 <data.table[100x3]> <data.table[50x3]>
#> 3: <vfold_split[100x50x150x3]> Fold3 <data.table[100x3]> <data.table[50x3]>
# Returns a list where each element contains:
# - splits: rsample split objects
# - id: fold numbers (Fold1, Fold2, Fold3)
# - train: training set data
# - validate: validation set data
# Example 2: Cross-validation repeated twice
split_cv(
  split_dt = dt_split, # List of split data.tables
  v = 3,               # Number of folds per repeat
  repeats = 2          # Two independent repeats of the 3-fold split
)
#> $Sepal.Length
#> splits id id2 train
#> <list> <char> <char> <list>
#> 1: <vfold_split[100x50x150x3]> Repeat1 Fold1 <data.table[100x3]>
#> 2: <vfold_split[100x50x150x3]> Repeat1 Fold2 <data.table[100x3]>
#> 3: <vfold_split[100x50x150x3]> Repeat1 Fold3 <data.table[100x3]>
#> 4: <vfold_split[100x50x150x3]> Repeat2 Fold1 <data.table[100x3]>
#> 5: <vfold_split[100x50x150x3]> Repeat2 Fold2 <data.table[100x3]>
#> 6: <vfold_split[100x50x150x3]> Repeat2 Fold3 <data.table[100x3]>
#> validate
#> <list>
#> 1: <data.table[50x3]>
#> 2: <data.table[50x3]>
#> 3: <data.table[50x3]>
#> 4: <data.table[50x3]>
#> 5: <data.table[50x3]>
#> 6: <data.table[50x3]>
#>
#> $Sepal.Width
#> splits id id2 train
#> <list> <char> <char> <list>
#> 1: <vfold_split[100x50x150x3]> Repeat1 Fold1 <data.table[100x3]>
#> 2: <vfold_split[100x50x150x3]> Repeat1 Fold2 <data.table[100x3]>
#> 3: <vfold_split[100x50x150x3]> Repeat1 Fold3 <data.table[100x3]>
#> 4: <vfold_split[100x50x150x3]> Repeat2 Fold1 <data.table[100x3]>
#> 5: <vfold_split[100x50x150x3]> Repeat2 Fold2 <data.table[100x3]>
#> 6: <vfold_split[100x50x150x3]> Repeat2 Fold3 <data.table[100x3]>
#> validate
#> <list>
#> 1: <data.table[50x3]>
#> 2: <data.table[50x3]>
#> 3: <data.table[50x3]>
#> 4: <data.table[50x3]>
#> 5: <data.table[50x3]>
#> 6: <data.table[50x3]>
#>
#> $Petal.Length
#> splits id id2 train
#> <list> <char> <char> <list>
#> 1: <vfold_split[100x50x150x3]> Repeat1 Fold1 <data.table[100x3]>
#> 2: <vfold_split[100x50x150x3]> Repeat1 Fold2 <data.table[100x3]>
#> 3: <vfold_split[100x50x150x3]> Repeat1 Fold3 <data.table[100x3]>
#> 4: <vfold_split[100x50x150x3]> Repeat2 Fold1 <data.table[100x3]>
#> 5: <vfold_split[100x50x150x3]> Repeat2 Fold2 <data.table[100x3]>
#> 6: <vfold_split[100x50x150x3]> Repeat2 Fold3 <data.table[100x3]>
#> validate
#> <list>
#> 1: <data.table[50x3]>
#> 2: <data.table[50x3]>
#> 3: <data.table[50x3]>
#> 4: <data.table[50x3]>
#> 5: <data.table[50x3]>
#> 6: <data.table[50x3]>
# Returns a list where each element contains:
# - splits: rsample split objects
# - id: repeat numbers (Repeat1, Repeat2)
# - id2: fold numbers (Fold1, Fold2, Fold3)
# - train: training set data
# - validate: validation set data
C:40A1p3ea03fe1bb5-started.R
# Setup: names of the columns that will be combined into pairs
col_names <- c(
  "Sepal.Length",
  "Sepal.Width",
  "Petal.Length"
)
# Example 1: Nest column pairs, joining the pair names with a custom separator
c2p_nest(
  iris,                  # Source data
  cols2bind = col_names, # Columns to combine into pairs
  pairs_n = 2,           # Size of each combination (pairs)
  sep = "&"              # Separator used in the generated pair names
)
#> pairs data
#> <char> <list>
#> 1: Sepal.Length&Sepal.Width <data.table[150x4]>
#> 2: Sepal.Length&Petal.Length <data.table[150x4]>
#> 3: Sepal.Width&Petal.Length <data.table[150x4]>
# Returns a nested data.table where:
# - pairs: combined column names (e.g., "Sepal.Length&Sepal.Width")
# - data: list column containing data.tables with value1, value2 columns
# Example 2: Same pairing, selecting columns by position and nesting per group
c2p_nest(
  iris,            # Source data
  cols2bind = 1:3, # Columns 1-3 are combined
  pairs_n = 2,     # Size of each combination (pairs)
  by = 5           # Group by column 5 (Species)
)
#> pairs Species data
#> <char> <fctr> <list>
#> 1: Sepal.Length-Sepal.Width setosa <data.table[50x3]>
#> 2: Sepal.Length-Sepal.Width versicolor <data.table[50x3]>
#> 3: Sepal.Length-Sepal.Width virginica <data.table[50x3]>
#> 4: Sepal.Length-Petal.Length setosa <data.table[50x3]>
#> 5: Sepal.Length-Petal.Length versicolor <data.table[50x3]>
#> 6: Sepal.Length-Petal.Length virginica <data.table[50x3]>
#> 7: Sepal.Width-Petal.Length setosa <data.table[50x3]>
#> 8: Sepal.Width-Petal.Length versicolor <data.table[50x3]>
#> 9: Sepal.Width-Petal.Length virginica <data.table[50x3]>
# Returns a nested data.table where:
# - pairs: combined column names
# - Species: grouping variable
# - data: list column containing data.tables grouped by Species
C:40A1p3ea03fe1bb5-started.R
# Example 1: Row-to-pairs nesting, columns given by name
r2p_nest(
  mtcars,                    # Source data
  rows2bind = "cyl",         # Column supplying the row values
  by = c("hp", "drat", "wt") # Columns transformed into pairs
)
#> name data
#> <fctr> <list>
#> 1: hp <data.table[32x12]>
#> 2: drat <data.table[32x12]>
#> 3: wt <data.table[32x12]>
# Returns a nested data.table where:
# - name: variable names (hp, drat, wt)
# - data: list column containing data.tables with rows grouped by cyl values
# Example 2: Row-to-pairs nesting, columns given by position
r2p_nest(
  mtcars,        # Source data
  rows2bind = 2, # Column 2 (cyl) supplies the row values
  by = 4:6       # Columns 4-6 (hp, drat, wt) are used for the pairs
)
#> name data
#> <fctr> <list>
#> 1: hp <data.table[32x12]>
#> 2: drat <data.table[32x12]>
#> 3: wt <data.table[32x12]>
# Returns a nested data.table where:
# - name: variable names from columns 4-6
# - data: list column containing data.tables with rows grouped by cyl values
C:40A1p3ea03fe1bb5-started.R
# Example 1: Export a nested data.table to files
# Step 1: Build the nested input
dt_nest <- w2l_nest(
  data = iris,   # Source data
  cols2l = 1:2,  # Columns to nest
  by = "Species" # Grouping variable
)

# Step 2: Write each nested table to disk
export_nest(
  nest_dt = dt_nest,                # Nested data.table from step 1
  nest_col = "data",                # List column holding the nested tables
  group_cols = c("name", "Species") # Columns that define the directory layout
)
#> [1] 6
# Returns the number of files created
# Creates directory structure: tempdir()/name/Species/data.txt
# Check exported files
# `pattern` is a regular expression, so it is anchored to the ".txt"
# extension; a bare "txt" would match any file name containing those letters.
list.files(
  path = tempdir(),    # Default export directory
  pattern = "\\.txt$", # Only files ending in .txt
  recursive = TRUE     # Search in subdirectories
)
#> [1] "Sepal.Length/setosa/data.txt" "Sepal.Length/versicolor/data.txt"
#> [3] "Sepal.Length/virginica/data.txt" "Sepal.Width/setosa/data.txt"
#> [5] "Sepal.Width/versicolor/data.txt" "Sepal.Width/virginica/data.txt"
# Returns list of created files and their paths
# Clean up exported files
# The pattern is anchored to the ".txt" extension: `pattern` is a regular
# expression, and a bare "txt" would also match (and so delete) unrelated
# files in tempdir() whose names merely contain "txt".
files <- list.files(
  path = tempdir(),    # Default export directory
  pattern = "\\.txt$", # Only files ending in .txt
  recursive = TRUE,    # Search in subdirectories
  full.names = TRUE    # Return full file paths
)
file.remove(files)     # Remove all exported files
#> [1] TRUE TRUE TRUE TRUE TRUE TRUE
C:40A1p3ea03fe1bb5-started.R
# Example: Export a list of split data.tables to files
# Step 1: Build the split input
dt_split <- w2l_split(
  data = iris,   # Source data
  cols2l = 1:2,  # Columns to split
  by = "Species" # Grouping variable
)

# Step 2: Write every list element to disk
export_list(
  split_dt = dt_split # List of data.tables from step 1
)
#> [1] 6
# Returns the number of files created
# Files are saved in tempdir() with .txt extension
# Check exported files
# `pattern` is a regular expression, so it is anchored to the ".txt"
# extension; a bare "txt" would match any file name containing those letters.
list.files(
  path = tempdir(),    # Default export directory
  pattern = "\\.txt$", # Only files ending in .txt
  recursive = TRUE     # Search in subdirectories
)
#> [1] "Sepal.Length_setosa.txt" "Sepal.Length_versicolor.txt"
#> [3] "Sepal.Length_virginica.txt" "Sepal.Width_setosa.txt"
#> [5] "Sepal.Width_versicolor.txt" "Sepal.Width_virginica.txt"
# Clean up exported files
# The pattern is anchored to the ".txt" extension: `pattern` is a regular
# expression, and a bare "txt" would also match (and so delete) unrelated
# files in tempdir() whose names merely contain "txt".
files <- list.files(
  path = tempdir(),    # Default export directory
  pattern = "\\.txt$", # Only files ending in .txt
  recursive = TRUE,    # Search in subdirectories
  full.names = TRUE    # Return full file paths
)
file.remove(files)     # Remove all exported files
#> [1] TRUE TRUE TRUE TRUE TRUE TRUE
C:40A1p3ea03fe1bb5-started.R
# Preview the first rows of the table returned by fires()
fires() |>
  head()
#> Location Tag Date Entry Exit Ent Wt Ext Wt Consumed Weight
#> <int> <int> <char> <char> <char> <num> <num> <num> <num>
#> 1: 101 35877 2024-10-07 14:15:39 14:18:02 0.678 0.632 0.046 67.6
#> 2: 101 35873 2024-10-07 14:18:03 14:23:05 0.632 0.384 0.248 60.8
#> 3: 101 35878 2024-10-07 14:23:15 14:28:45 0.670 0.469 0.201 70.8
#> 4: 101 35855 2024-10-07 14:29:05 14:34:29 0.755 0.634 0.121 51.2
#> 5: 101 35877 2024-10-07 14:34:30 14:34:37 0.634 0.634 0.000 0.0
#> 6: 101 35853 2024-10-07 14:34:38 14:36:26 0.634 0.634 0.000 88.6
#> Topup Amount
#> <num>
#> 1: 0.286
#> 2: 0.000
#> 3: 0.286
#> 4: 0.286
#> 5: 0.000
#> 6: 0.000
C:40A1p3ea03fe1bb5-started.R
# Preview the first rows of the table returned by nedaps()
nedaps() |>
  head()
#> animal_number lifenumber responder location visit_time duration
#> <int> <lgcl> <int> <int> <POSc> <int>
#> 1: 10115497 NA 15497 101 2024-09-06 20:22:51 3
#> 2: 10115967 NA 15967 101 2024-09-06 20:22:54 65
#> 3: 10115983 NA 15983 101 2024-09-06 20:23:59 2
#> 4: 10115967 NA 15967 101 2024-09-06 20:24:01 11
#> 5: 10115983 NA 15983 101 2024-09-06 20:24:12 2
#> 6: 10115967 NA 15967 101 2024-09-06 20:24:14 33
#> state weight feed_intake
#> <int> <int> <int>
#> 1: 0 46500 0
#> 2: 0 22000 17
#> 3: 0 33000 0
#> 4: 0 33500 0
#> 5: 0 35500 0
#> 6: 0 31000 0
C:40A1p3ea03fe1bb5-started.R
# Example 1: Build nested inputs
# One list column: nest iris by Species
df_nest1 <- dplyr::group_nest(iris, Species)

# Two list columns: start from the same nested frame and add `data2`, a copy
# of each nested table with a constant column c = 2 appended
df_nest2 <- df_nest1 |>
  dplyr::mutate(
    data2 = purrr::map(data, \(tbl) dplyr::mutate(tbl, c = 2))
  )
# Example 2: Convert nested structures
# Nested data frame -> nested data.table
convert_nest(
  df_nest1, # Nested data frame with a single list column
  to = "dt" # Target format: data.table
)
#> Species data
#> <fctr> <list>
#> 1: setosa <data.table[50x4]>
#> 2: versicolor <data.table[50x4]>
#> 3: virginica <data.table[50x4]>
# Convert only selected list columns; the others keep their original class
convert_nest(
  df_nest2,          # Nested data frame with two list columns
  to = "dt",         # Target format: data.table
  nest_cols = "data" # Convert `data` only; `data2` is left as is
)
#> Species data data2
#> <fctr> <list> <list>
#> 1: setosa <data.table[50x4]> <tbl_df[50x5]>
#> 2: versicolor <data.table[50x4]> <tbl_df[50x5]>
#> 3: virginica <data.table[50x4]> <tbl_df[50x5]>
# Example 3: Nested data.table -> nested data frame
dt_nest <- mintyr::w2l_nest(
  data = iris, # Source data
  cols2l = 1:2 # Columns to nest
)

convert_nest(
  dt_nest,  # Nested data.table built above
  to = "df" # Target format: data frame (printed as a tibble)
)
#> # A tibble: 2 × 2
#> name data
#> <fct> <list>
#> 1 Sepal.Length <tibble [150 × 4]>
#> 2 Sepal.Width <tibble [150 × 4]>
C:40A1p3ea03fe1bb5-started.R
# Example: Path segment extraction demonstrations
# Setup: one Windows-style path and two Unix paths
paths <- c(
  "C:/home/user/documents", # Windows-style path with a drive letter
  "/var/log/system",        # Unix system path
  "/usr/local/bin"          # Unix binary path
)
# Example 1: First segment of every path
get_path_segment(
  paths, # Input paths
  1      # Position of the segment to return
)
#> [1] "home" "var" "usr"
# Returns: c("home", "var", "usr")
# Example 2: Second-to-last segment (negative positions count from the end)
get_path_segment(
  paths, # Input paths
  -2     # Second segment from the end
)
#> [1] "user" "log" "local"
# Returns: c("user", "log", "local")
# Example 3: Everything from the first to the last segment
get_path_segment(
  paths,   # Input paths
  c(1, -1) # Range: first segment through last segment
)
#> [1] "home/user/documents" "var/log/system" "usr/local/bin"
# Returns full paths without drive letters
# Example 4: First three segments
get_path_segment(
  paths,  # Input paths
  c(1, 3) # Range: first segment through third segment
)
#> [1] "home/user/documents" "var/log/system" "usr/local/bin"
# Returns: c("home/user/documents", "var/log/system", "usr/local/bin")
# Example 5: Last two segments, with the range bounds given in reverse
# (the printed result keeps the segments in their original path order)
get_path_segment(
  paths,     # Input paths
  c(-1, -2)  # Range bounded by the last and second-to-last segments
)
#> [1] "user/documents" "log/system" "local/bin"
# Returns: c("user/documents", "log/system", "local/bin") (segments stay in path order)
# Example 6: First two segments
get_path_segment(
  paths,  # Input paths
  c(1, 2) # Range: first segment through second segment
)
#> [1] "home/user" "var/log" "usr/local"
# Returns: c("home/user", "var/log", "usr/local")
C:40A1p3ea03fe1bb5-started.R
# Example: Number formatting demonstrations
# Setup: two numeric columns and one character column
dt <- data.table::data.table(
  a = c(0.1234, 0.5678),  # Numeric column 1
  b = c(0.2345, 0.6789),  # Numeric column 2
  c = c("text1", "text2") # Text column
)

# Example 1: Format every numeric column
format_digits(
  dt,        # Input data.table
  digits = 2 # Keep 2 decimal places
)
#> a b c
#> <char> <char> <char>
#> 1: 0.12 0.23 text1
#> 2: 0.57 0.68 text2
# Example 2: Format a single column as a percentage
format_digits(
  dt,               # Input data.table
  cols = c("a"),    # Restrict formatting to column `a`
  digits = 2,       # Keep 2 decimal places
  percentage = TRUE # Express the values as percentages
)
#> a b c
#> <char> <num> <char>
#> 1: 12.34% 0.2345 text1
#> 2: 56.78% 0.6789 text2
C:40A1p3ea03fe1bb5-started.R
# Resolve the full path of one bundled example file
mintyr_example(
  "csv_test1.csv"
)
#> [1] "C:/Users/Dell/AppData/Local/Temp/RtmpG40A1p/Rinst3ea045257f3c/mintyr/extdata/csv_test1.csv"
C:40A1p3ea03fe1bb5-started.R
# Show the names of every bundled example file
mintyr_examples(
)
#> [1] "csv_test1.csv" "csv_test2.csv" "xlsx_test1.xlsx" "xlsx_test2.xlsx"
C:40A1p3ea03fe1bb5-started.R
# Example: Excel file import demonstrations
# Setup: look up the bundled Excel example files and resolve their paths
xlsx_files <- mintyr_examples("xlsx_test") |>
  mintyr_example()

# Example 1: Read every sheet of every file into one table
import_xlsx(
  xlsx_files,  # Excel file paths
  rbind = TRUE # Stack all sheets into a single data.table
)
#> excel_name sheet_name col1 col2 col3
#> <char> <char> <num> <char> <lgcl>
#> 1: xlsx_test1 Sheet1 4 d FALSE
#> 2: xlsx_test1 Sheet1 5 f TRUE
#> 3: xlsx_test1 Sheet1 6 e TRUE
#> 4: xlsx_test1 Sheet2 1 a TRUE
#> 5: xlsx_test1 Sheet2 2 b FALSE
#> 6: xlsx_test1 Sheet2 3 c TRUE
#> 7: xlsx_test2 Sheet1 15 o FALSE
#> 8: xlsx_test2 Sheet1 16 p TRUE
#> 9: xlsx_test2 Sheet1 17 q FALSE
#> 10: xlsx_test2 a 7 g FALSE
#> 11: xlsx_test2 a 9 h TRUE
#> 12: xlsx_test2 a 8 i FALSE
#> 13: xlsx_test2 b 10 J FALSE
#> 14: xlsx_test2 b 11 K TRUE
#> 15: xlsx_test2 b 12 L FALSE
# Example 2: Read one sheet per file and keep the results separate
import_xlsx(
  xlsx_files,    # Excel file paths
  rbind = FALSE, # Return each sheet as its own data.table
  sheet = 2      # Read only the second sheet of each file
)
#> $xlsx_test1_Sheet2
#> col1 col2 col3
#> <num> <char> <lgcl>
#> 1: 1 a TRUE
#> 2: 2 b FALSE
#> 3: 3 c TRUE
#>
#> $xlsx_test2_a
#> col1 col2 col3
#> <num> <char> <lgcl>
#> 1: 7 g FALSE
#> 2: 9 h TRUE
#> 3: 8 i FALSE
C:40A1p3ea03fe1bb5-started.R
# Example: CSV file import demonstrations
# Setup: look up the bundled CSV example files and resolve their paths
csv_files <- mintyr_examples("csv_test") |>
  mintyr_example()

# Example 1: Read with data.table and stack the files
import_csv(
  csv_files,              # CSV file paths
  package = "data.table", # Reader backend
  rbind = TRUE,           # Stack all files into a single data.table
  rbind_label = "_file"   # Name of the column recording the source file
)
#> _file col1 col2 col3
#> <char> <int> <char> <lgcl>
#> 1: csv_test1 4 d FALSE
#> 2: csv_test1 5 f TRUE
#> 3: csv_test1 6 e TRUE
#> 4: csv_test2 15 o FALSE
#> 5: csv_test2 16 p TRUE
#> 6: csv_test2 17 q FALSE
# Example 2: Read with arrow and keep one result per file
import_csv(
  csv_files,         # CSV file paths
  package = "arrow", # Reader backend
  rbind = FALSE      # Return a list with one table (tibble) per file
)
#> $csv_test1
#> # A tibble: 3 × 3
#> col1 col2 col3
#> <int> <chr> <lgl>
#> 1 4 d FALSE
#> 2 5 f TRUE
#> 3 6 e TRUE
#>
#> $csv_test2
#> # A tibble: 3 × 3
#> col1 col2 col3
#> <int> <chr> <lgl>
#> 1 15 o FALSE
#> 2 16 p TRUE
#> 3 17 q FALSE
C:40A1p3ea03fe1bb5-started.R
# Example: File path processing demonstrations
# Setup: look up the bundled Excel example files and resolve their paths
xlsx_files <- mintyr_examples("xlsx_test") |>
  mintyr_example()

# Example 1: Bare file names (no directory, no extension)
get_filename(
  xlsx_files,          # File paths
  rm_extension = TRUE, # Strip the extension
  rm_path = TRUE       # Strip the directory part
)
#> [1] "xlsx_test1" "xlsx_test2"
# Example 2: File names with their extensions
get_filename(
  xlsx_files,           # File paths
  rm_extension = FALSE, # Leave the extension in place
  rm_path = TRUE        # Strip the directory part
)
#> [1] "xlsx_test1.xlsx" "xlsx_test2.xlsx"
# Example 3: Full paths with the extension removed
get_filename(
  xlsx_files,          # File paths
  rm_extension = TRUE, # Strip the extension
  rm_path = FALSE      # Leave the directory part in place
)
#> [1] "C:/Users/Dell/AppData/Local/Temp/RtmpG40A1p/Rinst3ea045257f3c/mintyr/extdata/xlsx_test1"
#> [2] "C:/Users/Dell/AppData/Local/Temp/RtmpG40A1p/Rinst3ea045257f3c/mintyr/extdata/xlsx_test2"
C:40A1p3ea03fe1bb5-started.R
# Example: Wide to long format nesting demonstrations
# Example 1: Nest by a grouping column only
w2l_nest(
  data = iris,   # Source data
  by = "Species" # One nested table per Species
)
#> Species data
#> <fctr> <list>
#> 1: setosa <data.table[50x4]>
#> 2: versicolor <data.table[50x4]>
#> 3: virginica <data.table[50x4]>
# Example 2: Nest columns chosen by position, within each group
w2l_nest(
  data = iris,   # Source data
  cols2l = 1:4,  # Columns 1-4 are nested
  by = "Species" # Grouping column
)
#> name Species data
#> <fctr> <fctr> <list>
#> 1: Sepal.Length setosa <data.table[50x1]>
#> 2: Sepal.Length versicolor <data.table[50x1]>
#> 3: Sepal.Length virginica <data.table[50x1]>
#> 4: Sepal.Width setosa <data.table[50x1]>
#> 5: Sepal.Width versicolor <data.table[50x1]>
#> 6: Sepal.Width virginica <data.table[50x1]>
#> 7: Petal.Length setosa <data.table[50x1]>
#> 8: Petal.Length versicolor <data.table[50x1]>
#> 9: Petal.Length virginica <data.table[50x1]>
#> 10: Petal.Width setosa <data.table[50x1]>
#> 11: Petal.Width versicolor <data.table[50x1]>
#> 12: Petal.Width virginica <data.table[50x1]>
# Example 3: Nest columns chosen by name, grouping column chosen by position
w2l_nest(
  data = iris, # Source data
  cols2l = c(  # Columns to nest, given by name
    "Sepal.Length",
    "Sepal.Width",
    "Petal.Length"
  ),
  by = 5       # Column 5 (Species) is the grouping column
)
#> name Species data
#> <fctr> <fctr> <list>
#> 1: Sepal.Length setosa <data.table[50x2]>
#> 2: Sepal.Length versicolor <data.table[50x2]>
#> 3: Sepal.Length virginica <data.table[50x2]>
#> 4: Sepal.Width setosa <data.table[50x2]>
#> 5: Sepal.Width versicolor <data.table[50x2]>
#> 6: Sepal.Width virginica <data.table[50x2]>
#> 7: Petal.Length setosa <data.table[50x2]>
#> 8: Petal.Length versicolor <data.table[50x2]>
#> 9: Petal.Length virginica <data.table[50x2]>
# Returns similar structure to Example 2
C:40A1p3ea03fe1bb5-started.R
# Example: Wide to long format splitting demonstrations
# Example 1: Split by Species, then show the first 6 rows of each piece
lapply(
  w2l_split(
    data = iris,   # Source data
    by = "Species" # One list element per Species
  ),
  head
)
#> $setosa
#> Sepal.Length Sepal.Width Petal.Length Petal.Width
#> <num> <num> <num> <num>
#> 1: 5.1 3.5 1.4 0.2
#> 2: 4.9 3.0 1.4 0.2
#> 3: 4.7 3.2 1.3 0.2
#> 4: 4.6 3.1 1.5 0.2
#> 5: 5.0 3.6 1.4 0.2
#> 6: 5.4 3.9 1.7 0.4
#>
#> $versicolor
#> Sepal.Length Sepal.Width Petal.Length Petal.Width
#> <num> <num> <num> <num>
#> 1: 7.0 3.2 4.7 1.4
#> 2: 6.4 3.2 4.5 1.5
#> 3: 6.9 3.1 4.9 1.5
#> 4: 5.5 2.3 4.0 1.3
#> 5: 6.5 2.8 4.6 1.5
#> 6: 5.7 2.8 4.5 1.3
#>
#> $virginica
#> Sepal.Length Sepal.Width Petal.Length Petal.Width
#> <num> <num> <num> <num>
#> 1: 6.3 3.3 6.0 2.5
#> 2: 5.8 2.7 5.1 1.9
#> 3: 7.1 3.0 5.9 2.1
#> 4: 6.3 2.9 5.6 1.8
#> 5: 6.5 3.0 5.8 2.2
#> 6: 7.6 3.0 6.6 2.1
# Example 2: Split columns chosen by position, then preview each piece
lapply(
  w2l_split(
    data = iris,  # Source data
    cols2l = 1:3, # Columns 1-3 are split out
    by = 5        # Column 5 (Species) is the grouping column
  ),
  head            # First 6 rows of every list element
)
#> $Sepal.Length_setosa
#> Petal.Width value
#> <num> <num>
#> 1: 0.2 5.1
#> 2: 0.2 4.9
#> 3: 0.2 4.7
#> 4: 0.2 4.6
#> 5: 0.2 5.0
#> 6: 0.4 5.4
#>
#> $Sepal.Length_versicolor
#> Petal.Width value
#> <num> <num>
#> 1: 1.4 7.0
#> 2: 1.5 6.4
#> 3: 1.5 6.9
#> 4: 1.3 5.5
#> 5: 1.5 6.5
#> 6: 1.3 5.7
#>
#> $Sepal.Length_virginica
#> Petal.Width value
#> <num> <num>
#> 1: 2.5 6.3
#> 2: 1.9 5.8
#> 3: 2.1 7.1
#> 4: 1.8 6.3
#> 5: 2.2 6.5
#> 6: 2.1 7.6
#>
#> $Sepal.Width_setosa
#> Petal.Width value
#> <num> <num>
#> 1: 0.2 3.5
#> 2: 0.2 3.0
#> 3: 0.2 3.2
#> 4: 0.2 3.1
#> 5: 0.2 3.6
#> 6: 0.4 3.9
#>
#> $Sepal.Width_versicolor
#> Petal.Width value
#> <num> <num>
#> 1: 1.4 3.2
#> 2: 1.5 3.2
#> 3: 1.5 3.1
#> 4: 1.3 2.3
#> 5: 1.5 2.8
#> 6: 1.3 2.8
#>
#> $Sepal.Width_virginica
#> Petal.Width value
#> <num> <num>
#> 1: 2.5 3.3
#> 2: 1.9 2.7
#> 3: 2.1 3.0
#> 4: 1.8 2.9
#> 5: 2.2 3.0
#> 6: 2.1 3.0
#>
#> $Petal.Length_setosa
#> Petal.Width value
#> <num> <num>
#> 1: 0.2 1.4
#> 2: 0.2 1.4
#> 3: 0.2 1.3
#> 4: 0.2 1.5
#> 5: 0.2 1.4
#> 6: 0.4 1.7
#>
#> $Petal.Length_versicolor
#> Petal.Width value
#> <num> <num>
#> 1: 1.4 4.7
#> 2: 1.5 4.5
#> 3: 1.5 4.9
#> 4: 1.3 4.0
#> 5: 1.5 4.6
#> 6: 1.3 4.5
#>
#> $Petal.Length_virginica
#> Petal.Width value
#> <num> <num>
#> 1: 2.5 6.0
#> 2: 1.9 5.1
#> 3: 2.1 5.9
#> 4: 1.8 5.6
#> 5: 2.2 5.8
#> 6: 2.1 6.6
# Example 3: Split columns chosen by name
list_res <- w2l_split(
  data = iris,                              # Source data
  cols2l = c("Sepal.Length", "Sepal.Width"), # Columns to split, by name
  by = "Species"                            # Grouping column
)
# Preview the first 6 rows of every list element
list_res |>
  lapply(head)
#> $Sepal.Length_setosa
#> Petal.Length Petal.Width value
#> <num> <num> <num>
#> 1: 1.4 0.2 5.1
#> 2: 1.4 0.2 4.9
#> 3: 1.3 0.2 4.7
#> 4: 1.5 0.2 4.6
#> 5: 1.4 0.2 5.0
#> 6: 1.7 0.4 5.4
#>
#> $Sepal.Length_versicolor
#> Petal.Length Petal.Width value
#> <num> <num> <num>
#> 1: 4.7 1.4 7.0
#> 2: 4.5 1.5 6.4
#> 3: 4.9 1.5 6.9
#> 4: 4.0 1.3 5.5
#> 5: 4.6 1.5 6.5
#> 6: 4.5 1.3 5.7
#>
#> $Sepal.Length_virginica
#> Petal.Length Petal.Width value
#> <num> <num> <num>
#> 1: 6.0 2.5 6.3
#> 2: 5.1 1.9 5.8
#> 3: 5.9 2.1 7.1
#> 4: 5.6 1.8 6.3
#> 5: 5.8 2.2 6.5
#> 6: 6.6 2.1 7.6
#>
#> $Sepal.Width_setosa
#> Petal.Length Petal.Width value
#> <num> <num> <num>
#> 1: 1.4 0.2 3.5
#> 2: 1.4 0.2 3.0
#> 3: 1.3 0.2 3.2
#> 4: 1.5 0.2 3.1
#> 5: 1.4 0.2 3.6
#> 6: 1.7 0.4 3.9
#>
#> $Sepal.Width_versicolor
#> Petal.Length Petal.Width value
#> <num> <num> <num>
#> 1: 4.7 1.4 3.2
#> 2: 4.5 1.5 3.2
#> 3: 4.9 1.5 3.1
#> 4: 4.0 1.3 2.3
#> 5: 4.6 1.5 2.8
#> 6: 4.5 1.3 2.8
#>
#> $Sepal.Width_virginica
#> Petal.Length Petal.Width value
#> <num> <num> <num>
#> 1: 6.0 2.5 3.3
#> 2: 5.1 1.9 2.7
#> 3: 5.9 2.1 3.0
#> 4: 5.6 1.8 2.9
#> 5: 5.8 2.2 3.0
#> 6: 6.6 2.1 3.0
# Returns similar structure to Example 2
C:40A1p3ea03fe1bb5-started.R
# Example: Cross-validation for nested data.table demonstrations
# Setup: nest the first two iris columns
dt_nest <- w2l_nest(
  data = iris, # Source data
  cols2l = 1:2 # Columns to nest
)

# Example 1: Plain 2-fold cross-validation
nest_cv(
  nest_dt = dt_nest, # Nested data.table built above
  v = 2              # Number of folds
)
#> name splits id train
#> <fctr> <list> <char> <list>
#> 1: Sepal.Length <vfold_split[75x75x150x4]> Fold1 <data.table[75x4]>
#> 2: Sepal.Length <vfold_split[75x75x150x4]> Fold2 <data.table[75x4]>
#> 3: Sepal.Width <vfold_split[75x75x150x4]> Fold1 <data.table[75x4]>
#> 4: Sepal.Width <vfold_split[75x75x150x4]> Fold2 <data.table[75x4]>
#> validate
#> <list>
#> 1: <data.table[75x4]>
#> 2: <data.table[75x4]>
#> 3: <data.table[75x4]>
#> 4: <data.table[75x4]>
# Example 2: 2-fold cross-validation repeated twice
nest_cv(
  nest_dt = dt_nest, # Nested data.table
  v = 2,             # Number of folds per repeat
  repeats = 2        # Number of repeats
)
#> name splits id id2 train
#> <fctr> <list> <char> <char> <list>
#> 1: Sepal.Length <vfold_split[75x75x150x4]> Repeat1 Fold1 <data.table[75x4]>
#> 2: Sepal.Length <vfold_split[75x75x150x4]> Repeat1 Fold2 <data.table[75x4]>
#> 3: Sepal.Length <vfold_split[75x75x150x4]> Repeat2 Fold1 <data.table[75x4]>
#> 4: Sepal.Length <vfold_split[75x75x150x4]> Repeat2 Fold2 <data.table[75x4]>
#> 5: Sepal.Width <vfold_split[75x75x150x4]> Repeat1 Fold1 <data.table[75x4]>
#> 6: Sepal.Width <vfold_split[75x75x150x4]> Repeat1 Fold2 <data.table[75x4]>
#> 7: Sepal.Width <vfold_split[75x75x150x4]> Repeat2 Fold1 <data.table[75x4]>
#> 8: Sepal.Width <vfold_split[75x75x150x4]> Repeat2 Fold2 <data.table[75x4]>
#> validate
#> <list>
#> 1: <data.table[75x4]>
#> 2: <data.table[75x4]>
#> 3: <data.table[75x4]>
#> 4: <data.table[75x4]>
#> 5: <data.table[75x4]>
#> 6: <data.table[75x4]>
#> 7: <data.table[75x4]>
#> 8: <data.table[75x4]>
C:40A1p3ea03fe1bb5-started.R
# Example 1: Single trait, keeping the selected rows
# Picks the top 10% of observations ranked by Petal.Width. With
# keep_data = TRUE the result carries both the summary statistics and the
# rows that were selected.
top_perc(
  iris,
  perc = 0.1,            # Top 10%
  trait = "Petal.Width", # Column to rank on
  keep_data = TRUE       # Also return the selected rows
)
#> $Petal.Width_0.1
#> $Petal.Width_0.1$stat
#> # A tibble: 1 × 5
#> variable n mean sd top_perc
#> <fct> <dbl> <dbl> <dbl> <chr>
#> 1 Petal.Width 17 2.34 0.1 10%
#>
#> $Petal.Width_0.1$data
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 6.3 3.3 6.0 2.5 virginica
#> 2 6.5 3.0 5.8 2.2 virginica
#> 3 7.2 3.6 6.1 2.5 virginica
#> 4 5.8 2.8 5.1 2.4 virginica
#> 5 6.4 3.2 5.3 2.3 virginica
#> 6 7.7 3.8 6.7 2.2 virginica
#> 7 7.7 2.6 6.9 2.3 virginica
#> 8 6.9 3.2 5.7 2.3 virginica
#> 9 6.4 2.8 5.6 2.2 virginica
#> 10 7.7 3.0 6.1 2.3 virginica
#> 11 6.3 3.4 5.6 2.4 virginica
#> 12 6.7 3.1 5.6 2.4 virginica
#> 13 6.9 3.1 5.1 2.3 virginica
#> 14 6.8 3.2 5.9 2.3 virginica
#> 15 6.7 3.3 5.7 2.5 virginica
#> 16 6.7 3.0 5.2 2.3 virginica
#> 17 6.2 3.4 5.4 2.3 virginica
# Example 2: Grouped selection via `by`
# Runs the same top-10% selection separately within each Species.
# keep_data is not set here, and the printed result is a single summary
# table with one row per Species.
top_perc(
  iris,
  perc = 0.1,            # Top 10% within each group
  trait = "Petal.Width", # Column to rank on
  by = "Species"         # Grouping column
)
#> # A tibble: 3 × 6
#> Species variable n mean sd top_perc
#> <fct> <fct> <dbl> <dbl> <dbl> <chr>
#> 1 setosa Petal.Width 9 0.433 0.071 10%
#> 2 versicolor Petal.Width 5 1.66 0.089 10%
#> 3 virginica Petal.Width 6 2.45 0.055 10%
# Example 3: Several percentages combined with several grouping columns
# Sepal.Length and Sepal.Width are first stacked into long format
# (`names` / `values`), then summarised per Species and measurement.
iris |>
  tidyr::pivot_longer(
    cols = 1:2,
    names_to = "names",
    values_to = "values"
  ) |>
  mintyr::top_perc(
    perc = c(0.1, -0.2),        # Two selections, reported as 10% and -20%
    trait = "values",           # Column to rank on
    by = c("Species", "names"), # Grouping columns
    type = "mean_sd"
  )
#> # A tibble: 12 × 7
#> Species names variable n mean sd top_perc
#> <fct> <chr> <fct> <dbl> <dbl> <dbl> <chr>
#> 1 setosa Sepal.Length values 5 5.64 0.134 10%
#> 2 setosa Sepal.Width values 6 4.08 0.194 10%
#> 3 versicolor Sepal.Length values 6 6.8 0.126 10%
#> 4 versicolor Sepal.Width values 5 3.26 0.089 10%
#> 5 virginica Sepal.Length values 5 7.74 0.089 10%
#> 6 virginica Sepal.Width values 5 3.6 0.2 10%
#> 7 setosa Sepal.Length values 11 4.53 0.135 -20%
#> 8 setosa Sepal.Width values 12 2.97 0.219 -20%
#> 9 versicolor Sepal.Length values 11 5.28 0.244 -20%
#> 10 versicolor Sepal.Width values 13 2.35 0.151 -20%
#> 11 virginica Sepal.Length values 11 5.79 0.336 -20%
#> 12 virginica Sepal.Width values 11 2.56 0.15 -20%
C:40A1p3ea03fe1bb5-started.R