yield <- 75.5 # Double (decimal)
num_plots <- 120 # Integer (whole number)
class(yield) # Check the type
[1] "numeric"
class(num_plots) # Often stored as 'numeric' (double) by default
[1] "numeric"
Think of data like building blocks:
Understanding these is fundamental to working with data in R.
R needs to know what kind of information it’s dealing with.
yield <- 75.5 # Double (decimal)
num_plots <- 120 # Integer (whole number)
class(yield) # Check the type
[1] "numeric"
class(num_plots) # Often stored as 'numeric' (double) by default
[1] "numeric"
Character: text enclosed in double (") or single (') quotes. Used for IDs, names, descriptions.
variety_name <- "ICARDA_RustResist"
plot_id <- 'Plot_A101'
class(variety_name)
[1] "character"
is_resistant <- TRUE
yield > 80 # This comparison results in a logical value
[1] FALSE
class(is_resistant)
[1] "logical"
# Example: Different locations in a trial
locations <- c("Baku", "Ganja", "Baku", "Sheki", "Ganja")
location_factor <- factor(locations)
print(location_factor) # Shows levels
[1] Baku Ganja Baku Sheki Ganja
Levels: Baku Ganja Sheki
class(location_factor)
[1] "factor"
levels(location_factor) # See the unique categories
[1] "Baku" "Ganja" "Sheki"
How R organizes collections of data:
Vectors are created with c() (combine function).
# Vector of plot yields (numeric)
plot_yields <- c(75.5, 81.2, 78.9, 85.0)
# Vector of variety names (character)
plot_varieties <- c("ICARDA_Gold", "Local_Check", "ICARDA_Gold",
                    "ICARDA_RustResist")
# Vector of resistance status (logical)
plot_resistance <- c(TRUE, FALSE, TRUE, TRUE)
plot_yields[1] # Access the first element (Indexing starts at 1!)
[1] 75.5
plot_yields[2:4] # Access elements 2 through 4
[1] 81.2 78.9 85.0
length(plot_yields) # Get the number of elements
[1] 4
Important: If you mix types in c(), R will force them into a single common type (usually character).
mixed_vector <- c(10, "VarietyA", TRUE)
print(mixed_vector) # All become character strings!
[1] "10" "VarietyA" "TRUE"
class(mixed_vector)
[1] "character"
# Example: Small genotype matrix (Individuals x SNPs)
genotype_data <- matrix(c(0, 1, 2, 1, 1, 0), nrow = 2, ncol = 3, byrow = TRUE)
rownames(genotype_data) <- c("Line1", "Line2")
colnames(genotype_data) <- c("SNP1", "SNP2", "SNP3")
print(genotype_data)
SNP1 SNP2 SNP3
Line1 0 1 2
Line2 1 1 0
class(genotype_data)
[1] "matrix" "array"
dim(genotype_data) # Get dimensions (rows, columns)
[1] 2 3
genotype_data[1, 2] # Access element row 1, column 2
[1] 1
Data Frame: The most important data structure for breeders! Like a spreadsheet or table in R.
# Create a simple breeding trial data frame
trial_data <- data.frame(
  PlotID = c("A101", "A102", "B101", "B102"),
  Variety = factor(c("ICARDA_Gold", "Local_Check", "ICARDA_RustResist",
                     "ICARDA_Gold")),
  Yield_kg_plot = c(5.2, 4.5, 6.1, 5.5),
  Is_Resistant = c(TRUE, FALSE, TRUE, TRUE)
)
print(trial_data)
PlotID Variety Yield_kg_plot Is_Resistant
1 A101 ICARDA_Gold 5.2 TRUE
2 A102 Local_Check 4.5 FALSE
3 B101 ICARDA_RustResist 6.1 TRUE
4 B102 ICARDA_Gold 5.5 TRUE
class(trial_data)
[1] "data.frame"
str(trial_data) # Structure: Shows types of each column - VERY USEFUL!
'data.frame': 4 obs. of 4 variables:
$ PlotID : chr "A101" "A102" "B101" "B102"
$ Variety : Factor w/ 3 levels "ICARDA_Gold",..: 1 3 2 1
$ Yield_kg_plot: num 5.2 4.5 6.1 5.5
$ Is_Resistant : logi TRUE FALSE TRUE TRUE
head(trial_data) # Show first few rows
PlotID Variety Yield_kg_plot Is_Resistant
1 A101 ICARDA_Gold 5.2 TRUE
2 A102 Local_Check 4.5 FALSE
3 B101 ICARDA_RustResist 6.1 TRUE
4 B102 ICARDA_Gold 5.5 TRUE
summary(trial_data) # Summary statistics for each column
PlotID Variety Yield_kg_plot Is_Resistant
Length:4 ICARDA_Gold :2 Min. :4.500 Mode :logical
Class :character ICARDA_RustResist:1 1st Qu.:5.025 FALSE:1
Mode :character Local_Check :1 Median :5.350 TRUE :3
Mean :5.325
3rd Qu.:5.650
Max. :6.100
# Access columns using $
trial_data$Yield_kg_plot
[1] 5.2 4.5 6.1 5.5
mean(trial_data$Yield_kg_plot) # Calculate mean of a column
[1] 5.325
(We will work extensively with data frames).
analysis_results <- list(
  description = "Yield Trial - Baku 2023",
  raw_data = trial_data, # Include the data frame
  significant_snps = c("SNP101", "SNP504"), # A character vector
  model_parameters = list(threshold = 0.05, method = "MLM") # A nested list
)
analysis_results
$description
[1] "Yield Trial - Baku 2023"
$raw_data
PlotID Variety Yield_kg_plot Is_Resistant
1 A101 ICARDA_Gold 5.2 TRUE
2 A102 Local_Check 4.5 FALSE
3 B101 ICARDA_RustResist 6.1 TRUE
4 B102 ICARDA_Gold 5.5 TRUE
$significant_snps
[1] "SNP101" "SNP504"
$model_parameters
$model_parameters$threshold
[1] 0.05
$model_parameters$method
[1] "MLM"
print(analysis_results$description)
[1] "Yield Trial - Baku 2023"
print(analysis_results$raw_data) # Access the data frame inside the list
PlotID Variety Yield_kg_plot Is_Resistant
1 A101 ICARDA_Gold 5.2 TRUE
2 A102 Local_Check 4.5 FALSE
3 B101 ICARDA_RustResist 6.1 TRUE
4 B102 ICARDA_Gold 5.5 TRUE
print(analysis_results$model_parameters)
$threshold
[1] 0.05
$method
[1] "MLM"