Cholangiocarcinoma_Sequential-Chemotherapy_Predictive-Tests.Rmd

---
title: "Cholangiocarcinoma, Sequential Chemotherapy, and Predictive Tests"
author: "AJ Book"
output:
  word_document: default
  pdf_document: default
  html_document: default
editor_options:
  markdown:
    wrap: 72
---

```{r setup, include=FALSE}
# Echo the source of every chunk in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)

# Start from an empty workspace so objects left over from an earlier session
# cannot leak into this analysis.
rm(list = ls())

# Work from the project folder so the relative paths used below resolve.
# NOTE(review): this is a machine-specific absolute path; confirm it exists
# before knitting on another computer.
setwd("C:/Users/ajboo/BookAbraham/RProjects/MZBSurvivalAnalysis")

# Folder that receives generated tables and figures; created on first run.
output_dir <- "output"
if (!dir.exists(output_dir)) dir.create(output_dir)

# Prefix the figure files written by knitr with "<output_dir>/plot-".
knitr::opts_chunk$set(fig.path = paste0(output_dir, "/plot-"))


```


## Load Libraries

This section is reserved for libraries we will use throughout this RMD
file and any imported modules

```{python imports}
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter

```

```{r libraries, echo=TRUE}
library(tidyverse)
library(survival)
library(survminer)
library(ggsci)
library(knitr)
library(ggsurvfit)
library(gt)
library(reticulate)
library(maxstat)
```


Import the data file


```{python load and convert data}
def load_and_convert_data(file_path, cancer_type):
    """Read the study CSV and return the rows for a single cancer type.

    Parameters
    ----------
    file_path : str
        Path of the CSV file passed to ``pd.read_csv``.
    cancer_type : str
        Value of the ``Cancer_Type`` column to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows in which the grouping
        columns listed below are stored with the ``category`` dtype.
    """
    categorical_columns = ['Gender', 'Cancer_Type', 'Prior_Tx', 'Resistant', 'Cancer_Status', 'Risk_Group_ALAN']

    full_table = pd.read_csv(file_path)

    # Copy the filtered rows so the assignment below never writes into a view
    # of the full table.
    is_requested_type = full_table['Cancer_Type'] == cancer_type
    cancer_df = full_table.loc[is_requested_type].copy()

    cancer_df[categorical_columns] = cancer_df[categorical_columns].astype('category')

    print("Data for", cancer_type, "loaded successfully.")
    return cancer_df

# Build the Cholangiocarcinoma subset that every later chunk works on.
cholangio_df = load_and_convert_data(
    file_path="data/Organized_Bruckner_Data.csv",
    cancer_type="Cholangiocarcinoma",
)

```


```{python recode ALAN}
# Derive Risk_Group_ALAN from the numeric Prognostic_Score_ALAN column.
# The intervals are right-closed, so the edges below give
#   (-1, 0] -> Low_Risk, (0, 2] -> Intermediate_Risk, (2, 4] -> High_Risk
# and any score outside (-1, 4] becomes missing.
bins = [-1, 0, 2, 4]
labels = ['Low_Risk', 'Intermediate_Risk', 'High_Risk']

cholangio_df['Risk_Group_ALAN'] = pd.cut(
    x=cholangio_df['Prognostic_Score_ALAN'],
    bins=bins,
    right=True,
    labels=labels,
)

```

Examine the variables within your data 
```{python examine subset}
# Show a heading followed by the first rows of the subset.
print("\n Cholangiocarcinoma Data Frame:", cholangio_df.head(), sep="\n")


```


Check the data type (dtype) of each column in the data frame
```{python examine type}
# List the dtype pandas assigned to every column.
column_dtypes = cholangio_df.dtypes
print(column_dtypes)

```
## Numeric Summary

Step 1: Calculate the numeric statistics of the `cholangio_df` data
frame. Note: you can request percentiles, quantiles, and normality, or give specific percentiles, depending on what you are interested in. This analysis looks at the 33rd and 67th percentiles of the data.

Step 2: Create histograms, boxplots and distribution curves to visualize the descriptive statistics of the numeric variables.

```{python numeric summary}
def calculate_numeric_statistics(data):
    """Summarise every numeric column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column. The columns are the output of
        ``describe()`` followed by ``Q1``, ``Median``, ``Q3``, ``IQR``,
        ``33rd Percentile`` and ``67th Percentile``.
    """
    # Select only numeric columns
    numeric_data = data.select_dtypes(include=np.number)

    # Standard descriptive statistics (count, mean, std, min, quartiles, max)
    descriptive_stats = numeric_data.describe().transpose()

    # Quartiles and the interquartile range
    quantiles = numeric_data.quantile([0.25, 0.5, 0.75], axis=0).transpose()
    quantiles["IQR"] = quantiles[0.75] - quantiles[0.25]
    quantiles.columns = ["Q1", "Median", "Q3", "IQR"]

    # 33rd and 67th percentiles. DataFrame.quantile skips missing values, the
    # same way describe() and the quartiles above do; np.percentile would
    # return NaN for any column that contains a single missing value.
    custom_percentiles_df = numeric_data.quantile([0.33, 0.67], axis=0).transpose()
    custom_percentiles_df.columns = ["33rd Percentile", "67th Percentile"]

    # Combine all statistics side by side
    return pd.concat([descriptive_stats, quantiles, custom_percentiles_df], axis=1)

# Numeric summary table for the Cholangiocarcinoma subset.
cholangio_stats = calculate_numeric_statistics(cholangio_df)

# Print a heading and then the table itself.
print("Summary statistics for Cholangiocarcinoma:", cholangio_stats, sep="\n")
```

```{r advanced numeric summary}

# Helper functions (calc_num_stats, calc_cat_stats, ...) live in Utils.R.
source("Utils.R")

# Formatted numeric summary for the Cholangiocarcinoma data frame that was
# built in Python and is read here through reticulate's `py` object.
cholangio_table <- calc_num_stats(
  py$cholangio_df,
  title = "Numeric Statistics for Cholangiocarcinoma",
  selected_labels = c("Quantiles", "Percentiles"),
  percentiles = c(33, 67)
)

# Write the table to the output folder as a PNG image.
cholangio_table_file <- file.path(output_dir, "cholangio_table.png")
gtsave(cholangio_table, filename = cholangio_table_file)

```

```{python numeric distribution, echo=FALSE}
import matplotlib.pyplot as plt
import seaborn as sns

def plot_numeric_statistics(df, variable, subset):
    """Save a boxplot stacked above a density histogram for one numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains ``variable``.
    variable : str
        Name of the numeric column to plot.
    subset : str
        Label used in the figure title and in the output file name.

    The figure is written to ``output/{subset}_{variable}_plot.png`` and then
    closed; nothing is returned.
    """
    import os  # local import: only needed to make sure the output folder exists

    values = df[variable]
    mean = values.mean()
    std_dev = values.std()

    # Narrow boxplot on top, tall histogram underneath, sharing the x axis
    fig, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})

    # Boxplot with a dashed mean line and a solid median line
    sns.boxplot(x=values, ax=ax_box, color='orange', width=0.3, linewidth=1.5, showmeans=True, meanline=True,
                meanprops=dict(color='black', linestyle='--', linewidth=2),
                medianprops=dict(color='black', linewidth=2))
    ax_box.set_ylabel(variable)
    ax_box.text(0.5, 0.5, 'IQR', color='black', ha='center', fontsize=10, transform=ax_box.transAxes)
    ax_box.set_yticks([])

    # Histogram with a kernel density estimate
    sns.histplot(values, kde=True, bins=12, stat='density', color='skyblue', ax=ax_hist)
    ax_hist.set_xlabel(variable)
    ax_hist.set_ylabel('Density')

    # Reference lines for the mean and one standard deviation either side
    ax_hist.axvline(mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean:.2f}')
    ax_hist.axvline(mean + std_dev, color='purple', linestyle='--', linewidth=2, label=f'Mean + Std Dev: {mean + std_dev:.2f}')
    ax_hist.axvline(mean - std_dev, color='purple', linestyle='--', linewidth=2, label=f'Mean - Std Dev: {mean - std_dev:.2f}')
    # The labels above are only visible once a legend is drawn
    ax_hist.legend()

    sns.despine(ax=ax_hist)
    sns.despine(ax=ax_box, left=True)

    fig.suptitle(f'{subset} - by {variable}')
    fig.tight_layout()

    # Save and close this specific figure rather than whichever one pyplot
    # currently considers active
    os.makedirs('output', exist_ok=True)
    fig.savefig(f'output/{subset}_{variable}_plot.png')
    plt.close(fig)

# Numeric columns that should not get a distribution plot
exclude_columns = ['Prognostic_Score_ALAN', 'Event_Status']

# Every remaining int64/float64 column gets its own boxplot + histogram file
numeric_columns = cholangio_df.select_dtypes(include=['int64', 'float64']).columns
columns_to_plot = [name for name in numeric_columns if name not in exclude_columns]
for column in columns_to_plot:
    plot_numeric_statistics(cholangio_df, column, 'Cholangiocarcinoma')

```


```{r determine cutpoints}

# Cutoff points for Albumin, LMR, PLT, LY, MON, ANC, NLR, Alk_Phos,
# Prognostic_Score_ALAN and Age. A variable with several cutoffs gets one
# derived column per cutoff.
cutoff_points <- list(
  Albumin = 3.5,
  LMR = 2.1,
  PLT = 300,
  LY = 1.5,
  MON = 0.8,
  ANC = c(4, 8),
  NLR = c(3, 5),
  Alk_Phos = c(135, 200),
  Prognostic_Score_ALAN = c(0, 2, 4),
  Age = c(60, 65, 70)
)

#' Add categorical columns derived from numeric cutoff points
#'
#' For every variable named in `cutoffs` that exists in `df`, one column
#' `<variable>_<cutoff>` is added per cutoff, holding "< cutoff" or
#' ">= cutoff" (missing values stay missing). `Prognostic_Score_ALAN` is the
#' exception: it gets a single factor column
#' `Prognostic_Score_ALAN_category` with the fixed groups "0", "1-2" and
#' "3-4"; the cutoff values listed for it are not used.
#'
#' @param df A data frame.
#' @param cutoffs Named list of numeric cutoff vectors; defaults to the
#'   `cutoff_points` list defined above.
#' @return `df` with the derived columns added. Variables missing from `df`
#'   are reported with a message and skipped.
categorize_values <- function(df, cutoffs = cutoff_points) {
  for (variable in names(cutoffs)) {
    if (!variable %in% colnames(df)) {
      cat(paste0("Column '", variable, "' not found in the DataFrame.\n"))
      next
    }

    if (variable == "Prognostic_Score_ALAN") {
      df[[paste0(variable, "_category")]] <- cut(
        df[[variable]],
        breaks = c(-Inf, 0, 2, Inf),
        labels = c("0", "1-2", "3-4")
      )
      next
    }

    if (!is.numeric(cutoffs[[variable]])) {
      stop("Cutoff points for '", variable, "' must be numeric.", call. = FALSE)
    }

    # Assigning by name (instead of cbind) keeps a re-run from creating
    # duplicate columns.
    for (cutoff in cutoffs[[variable]]) {
      df[[paste0(variable, "_", cutoff)]] <- ifelse(
        df[[variable]] < cutoff,
        paste0("< ", cutoff),
        paste0(">= ", cutoff)
      )
    }
  }
  df
}


# Add the cutoff-based category columns to the data frame created in Python.
categorized_cholangio_df <- categorize_values(py$cholangio_df)

# Preview the first rows to confirm the new columns are present.
print("Categorized cholangio DataFrame:")
categorized_preview <- head(categorized_cholangio_df)
print(categorized_preview)


```

```{r convert to factors}

#' Convert the named columns of a data frame to factors
#'
#' @param df A data frame.
#' @param columns_to_convert Character vector of column names in `df`.
#' @return `df` with each named column replaced by `factor(column)`.
convert_to_factors <- function(df, columns_to_convert) {
  missing_columns <- setdiff(columns_to_convert, colnames(df))
  if (length(missing_columns) > 0) {
    stop(
      "Columns not found in the data frame: ",
      paste(missing_columns, collapse = ", "),
      call. = FALSE
    )
  }
  # List-style indexing (df[cols]) always yields a data frame, so lapply
  # iterates over columns even when only one name is supplied; df[, cols]
  # would collapse a single column to a plain vector.
  df[columns_to_convert] <- lapply(df[columns_to_convert], factor)
  df
}

# Cutoff-derived columns that hold "< x" / ">= x" labels and should be
# treated as factors in the survival models.
columns_to_convert <- c(
  "Age_60", "Age_65", "Age_70",
  "Albumin_3.5", "LMR_2.1", "PLT_300", "LY_1.5", "MON_0.8",
  "ANC_4", "ANC_8",
  "NLR_3", "NLR_5",
  "Alk_Phos_135", "Alk_Phos_200"
)

categorized_cholangio_df <- convert_to_factors(
  categorized_cholangio_df,
  columns_to_convert
)

# Inspect the column types after conversion.
str(categorized_cholangio_df)


```


## Categoric Summary

Calculate the Categorical statistics for our new cholangio data frame

```{python calculate categoric}
def calculate_categorical_statistics(data, title="Categorical Statistics"):
    """Tabulate the level frequencies of every non-numeric column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. An ``ID`` column, if present, is ignored.
    title : str, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    pandas.DataFrame
        One row per (variable, level) pair with the columns ``Variable``,
        ``Levels``, ``UniqueValues`` (number of levels of that variable),
        ``Frequencies`` and ``Proportions`` (formatted like ``"66.67%"``).
        The frame is empty, but has these columns, when ``data`` contains no
        non-numeric column.

    Raises
    ------
    ValueError
        If ``data`` is not a pandas DataFrame.
    """
    if not isinstance(data, pd.DataFrame):
        raise ValueError("Input 'data' must be a pandas DataFrame.")

    result_columns = ['Variable', 'Levels', 'UniqueValues', 'Frequencies', 'Proportions']

    # Drop the 'ID' column if it exists
    data = data.drop(columns=['ID'], errors='ignore')

    result_list = []
    for var in data.select_dtypes(exclude=['number']).columns:
        # value_counts orders the levels from most to least frequent
        categories = data[var].value_counts()

        result_list.append(pd.DataFrame({
            'Variable': [var] * len(categories),
            'Levels': categories.index,
            'UniqueValues': len(categories),
            'Frequencies': categories.values.tolist(),
            'Proportions': (categories / categories.sum()).map(lambda x: f"{x:.2%}").tolist()
        }))

    # pd.concat raises on an empty list, so return an empty, correctly shaped
    # table when there was nothing to tabulate
    if not result_list:
        return pd.DataFrame(columns=result_columns)

    return pd.concat(result_list, ignore_index=True)
```

```{python categorized stats}
import warnings

# Keep FutureWarning messages out of the rendered output
warnings.simplefilter("ignore", category=FutureWarning)

# Tabulate the categorical columns of the R data frame, read through
# reticulate's `r` object
categorized_cholangio_stats = calculate_categorical_statistics(data=r.categorized_cholangio_df)

print(categorized_cholangio_stats)


```


```{r advanced categoric summary}

library(gt)

#' Save a gt table as an image file
#'
#' @param table A gt table object.
#' @param filename File name (including extension) of the image to write.
#' @param path Folder the file is written to; defaults to "output".
#' @return Whatever `gtsave()` returns; called for its side effect of
#'   writing the file.
save_gt_as_image <- function(table, filename, path = "output") {
  gtsave(table, filename = filename, path = path)
}

# Build the formatted categorical summary table and write it to the output
# folder as a PNG image.
cat_stats_cholangio <- calc_cat_stats(
  categorized_cholangio_df,
  title = "Categoric Statistics for Cholangiocarcinoma"
)
save_gt_as_image(
  table = cat_stats_cholangio,
  filename = "categoric_stats_cholangio.png"
)

```

```{python categoric distribution}
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_combined_categorical_statistics(data, title="Categorical Statistics"):
    """Bar chart of level proportions for the categorical summary table.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``Variable``, ``Levels`` and ``Proportions`` columns, where
        ``Proportions`` holds strings such as ``"66.67%"``.
    title : str, optional
        Title placed above the chart.

    Rows with a 100%/0% split and the ``Risk_Group_ALAN`` variable are left
    out. Variable names are shortened to the text before the first
    underscore, so columns derived from the same measurement share one
    colour. The chart is shown with ``plt.show()``; nothing is returned.
    """
    # Work on a copy so the caller's DataFrame is not modified
    data_copy = data.copy()

    # Remove rows where the split is 100% to 0%
    data_copy = data_copy[~data_copy['Proportions'].isin(['100.00%', '0.00%'])]

    # Exclude the risk-group column
    data_copy = data_copy[data_copy['Variable'] != 'Risk_Group_ALAN']

    # Convert "12.34%" strings to numbers
    data_copy['Proportions'] = data_copy['Proportions'].str.rstrip('%').astype(float)

    # Combine similar variables by keeping the part before the first '_'
    data_copy['Variable'] = data_copy['Variable'].str.split('_').str[0]

    # Mean and standard error of the proportions per (variable, level)
    grouped_data = data_copy.groupby(['Variable', 'Levels'])['Proportions'].agg(['mean', 'sem']).reset_index()

    if grouped_data.empty:
        print("No categorical statistics left to plot.")
        return

    # Fix the order of the x categories and look up every row's position in
    # it. The row index only equals the x position while each level occurs
    # once, so the annotations below must not rely on it.
    level_order = list(dict.fromkeys(grouped_data['Levels']))
    level_position = {level: position for position, level in enumerate(level_order)}
    x_positions = [level_position[level] for level in grouped_data['Levels']]

    sns.set(style="whitegrid")
    plt.figure(figsize=(16, 8))

    # dodge=False centres every bar on its tick so that the error bars and
    # value labels drawn below line up with the bars
    sns.barplot(data=grouped_data, x='Levels', y='mean', hue='Variable', order=level_order, dodge=False)

    # Error bars (standard error of the mean)
    plt.errorbar(x=x_positions, y=grouped_data['mean'],
                 yerr=grouped_data['sem'], fmt='none', ecolor='black', capsize=3)

    # Value label above each bar
    for x_position, mean_value in zip(x_positions, grouped_data['mean']):
        plt.text(x_position, mean_value, f"{mean_value:.1f}", ha='center', va='bottom', fontsize=6)

    plt.title(title, fontsize=16)
    plt.xlabel('Levels', fontsize=14)
    plt.ylabel('Proportion', fontsize=14)
    plt.xticks(rotation=45, fontsize=8, ha='right')
    plt.yticks(fontsize=8)

    plt.legend(title='Variable', fontsize=6, title_fontsize=8, loc='upper right')

    plt.tight_layout()
    plt.show()

# Draw the combined categorical chart for the Cholangiocarcinoma summary
plot_combined_categorical_statistics(
    categorized_cholangio_stats,
    title="Categorical Statistics for Cholangiocarcinoma",
)

# Close the figure so it is not displayed again by a later chunk
plt.close()


```


## Survival Analysis

```{r Suvival Object}


# Survival object shared by the Kaplan-Meier and log-rank chunks below:
# follow-up time in months together with the event indicator.
surv_obj <- with(
  categorized_cholangio_df,
  Surv(time = time_diff_months, event = Event_Status)
)
```

# Overall Kaplan Meier 

```{r Kaplan Meier}

# Load required libraries
library(survival)
library(survminer)
library(ggsci)
library(ggsurvfit)
library(ggplotify)

# ggsurvplot() comes from survminer; attach the package if it is not already
# on the search path. The name to test for is the package, not the function.
if (!"package:survminer" %in% search()) {
  library(survminer)
}

# Overall (single-group) Kaplan-Meier fit for the whole cohort.
kmfit <- survfit(Surv(time = categorized_cholangio_df$time_diff_months, event = categorized_cholangio_df$Event_Status) ~1, data = categorized_cholangio_df)

# Survival curve with a stepped confidence band, a median-survival guide line
# and a 0-24 month x axis in 3-month steps.
kmplot <- ggsurvplot(kmfit,
                              data = categorized_cholangio_df,
                              title = "Survival Curve for Cholangiocarcinoma",
                              censor = TRUE,
                              xlab = "Time (Months)",
                              ylab = "Survival Probability",
                              conf.int = TRUE,
                              conf.int.style = "step",
                              conf.int.alpha = 0.2,
                              ggtheme = theme_minimal(),
                              surv.median.line = "hv",
                              xlim = c(0, 24),
                              break.time.by = 3,
                              breaks = seq(0, 24, by = 3),
                              surv.scale = "percent",
                              legend.labs = paste("Cholangiocarcinoma (N =", nrow(categorized_cholangio_df),")"),
                              palette = "lancet")

# Apply the KMunicate theme from ggsurvfit on top of the survminer plot.
kmplot <- kmplot + ggsurvfit::theme_ggsurvfit_KMunicate()

kmplot

# ggsave() needs the ggplot component of the ggsurvplot object.
kmplot_gg <- kmplot$plot

# Save the curve as a PNG in the output folder.
ggsave(filename = "output/kmplot.png", plot = kmplot_gg, width = 10, height = 6)


```
```{r}

# Legend labels "<level> (N =<count>)" for the first two levels of
# Cancer_Status; used by the Resistant vs. NPT Kaplan-Meier plot.
levels <- levels(categorized_cholangio_df$Cancer_Status)

label1 <- paste0(
  levels[1],
  " (N =", sum(categorized_cholangio_df$Cancer_Status == levels[1]), ")"
)
label2 <- paste0(
  levels[2],
  " (N =", sum(categorized_cholangio_df$Cancer_Status == levels[2]), ")"
)
```


```{r resistant KM overall}

# Kaplan-Meier fit stratified by Cancer_Status.
resistant_kmfit <- survfit(Surv(time = categorized_cholangio_df$time_diff_months, event = categorized_cholangio_df$Event_Status) ~ categorized_cholangio_df$Cancer_Status, data = categorized_cholangio_df)

# One curve per Cancer_Status level, labelled with the group sizes computed
# earlier (label1, label2).
resistant_kmplot <- ggsurvplot(
  resistant_kmfit,
  data = categorized_cholangio_df,
  # Titles and axis labels
  title = "Survival Curve for Resistant- vs. NPT- Cholangiocarcinoma",
  xlab = "Time (Months)",
  ylab = "Survival Probability",
  legend.labs = c(label1, label2),
  # Curve decorations
  censor = TRUE,
  conf.int = TRUE,
  conf.int.style = "step",
  conf.int.alpha = 0.2,
  surv.median.line = "hv",
  # Axis range: 0-24 months in 3-month steps
  xlim = c(0, 24),
  break.time.by = 3,
  breaks = seq(0, 24, by = 3),
  # Appearance
  ggtheme = theme_minimal(),
  palette = "lancet"
)

# Apply the KMunicate theme from ggsurvfit on top of the survminer plot.
resistant_kmplot <- resistant_kmplot + ggsurvfit::theme_ggsurvfit_KMunicate()

resistant_kmplot

# ggsave() needs the ggplot component of the ggsurvplot object.
resistant_gg <- resistant_kmplot$plot
ggsave(
  filename = "output/resistant_v_npt_kmplot.png",
  plot = resistant_gg,
  width = 10,
  height = 6
)
```


```{r rename df}
# Shorter alias for the categorized data frame, used by the chunks below.
cca_df <- categorized_cholangio_df

# List the available columns.
names(cca_df)
```

## Km Fit Curve

```{r KM Fit}
# Grouping columns for which a Kaplan-Meier fit is produced.
column_names <- c(
  "Cancer_Status",
  "Albumin_3.5", "LMR_2.1", "MON_0.8", "LY_1.5",
  "ANC_4", "ANC_8", "NLR_3", "NLR_5", "PLT_300",
  "Alk_Phos_135", "Alk_Phos_200",
  "Age_60", "Age_65", "Age_70",
  "Prognostic_Score_ALAN_category"
)

# One slot per grouping column, named "cca_kmfit_<column>".
cca_kmfits <- setNames(
  vector("list", length(column_names)),
  paste0("cca_kmfit_", column_names)
)

for (col in column_names) {
  # Stratify the shared survival object by the current column.
  formula <- as.formula(paste0("surv_obj ~ ", col))
  cca_kmfit <- survfit(formula, data = cca_df)
  cca_kmfits[[paste0("cca_kmfit_", col)]] <- cca_kmfit
}

# Show every fit; individual fits are available by name,
# e.g. cca_kmfits[["cca_kmfit_Albumin_3.5"]].
print("Cholangiocarcinoma Survival Fits:")
print(cca_kmfits)

# Separator line after the output
cat("\n", strrep("-", 40), "\n")

```
## LogRank


```{r Log-Rank Test}
 
# Grouping columns to test. This vector is also reused by the Cox and
# Schoenfeld chunks further down.
column_names <- c("Albumin_3.5", "LMR_2.1", "MON_0.8", "LY_1.5", "ANC_4", "ANC_8", "NLR_3", "NLR_5", "PLT_300", "Alk_Phos_135", "Alk_Phos_200", "Age_60", "Age_65", "Age_70", "Prognostic_Score_ALAN_category", "Cancer_Status")

# A column name that ends in "_<number>" carries its cutoff in that suffix;
# any other column (Cancer_Status, the ALAN category) has no cutoff.
cutoff_pattern <- "^.*_(\\d+(\\.\\d+)?)$"

# One result row per column, combined after the loop.
log_rank_rows <- vector("list", length(column_names))

for (i in seq_along(column_names)) {
  col <- column_names[i]

  cutoff <- if (grepl(cutoff_pattern, col)) {
    as.numeric(sub(cutoff_pattern, "\\1", col))
  } else {
    NA_real_
  }

  # Log-rank test of the shared survival object across the column's groups
  formula <- as.formula(paste("surv_obj ~", col))
  cca_logrank <- survdiff(formula, data = cca_df)

  # Degrees of freedom are the number of groups with a positive expected
  # count minus one, so variables with more than two groups are not forced
  # to df = 1.
  logrank_df <- sum(cca_logrank$exp > 0) - 1

  log_rank_rows[[i]] <- data.frame(
    variable = col,
    cutoff = cutoff,
    logrank_statistic = cca_logrank$chisq,
    logrank_p_value = pchisq(cca_logrank$chisq, df = logrank_df, lower.tail = FALSE),
    stringsAsFactors = FALSE
  )

  # Print log-rank test information for Cholangiocarcinoma
  cat(rep("-", 20), "\n")
  cat("Log-rank tests for Cholangiocarcinoma -", col, "\n")
  cat(rep("-", 20), "\n")
  print(cca_logrank)
}

log_rank_results_df <- do.call(rbind, log_rank_rows)

# Display log-rank test results in a table
kable(log_rank_results_df, caption = "Log-rank Test Results for Cholangiocarcinoma")

```

### Pairwise LogRank

These results are pairwise log-rank tests comparing different levels of the variable "Prognostic_Score_ALAN_category" within the data. Let's interpret each pairwise comparison:


```{r Pairwise Logrank}
# Get unique levels of Prognostic_Score_ALAN_category
levels_cca <- unique(cca_df$Prognostic_Score_ALAN_category)

# Work with the level labels as text: cat() prints a factor as its integer
# codes, and missing values are not a group to compare.
level_names_cca <- as.character(levels_cca[!is.na(levels_cca)])

# Initialize a list to store pairwise log-rank test results
pairwise_results_cca <- list()

# Perform a log-rank test for every pair of levels (none when fewer than two
# levels are present)
if (length(level_names_cca) >= 2) {
  for (level_pair in combn(level_names_cca, 2, simplify = FALSE)) {
    level1 <- level_pair[1]
    level2 <- level_pair[2]
    cat("Pairwise log-rank test between", level1, "and", level2, "\n")

    # Keep only the two groups being compared and drop the now-empty levels
    pair_df <- droplevels(
      subset(cca_df, Prognostic_Score_ALAN_category %in% level_pair)
    )
    pairwise_test <- survdiff(
      Surv(time_diff_months, Event_Status) ~ Prognostic_Score_ALAN_category,
      data = pair_df
    )
    print(pairwise_test)
    cat("\n")

    # Store the pairwise test result
    pairwise_results_cca[[paste0("pairwise_test_", level1, "_vs_", level2)]] <- pairwise_test
  }
}

# Print the results
print("Pairwise log-rank test results for Cholangiocarcinoma:")
print(pairwise_results_cca)

```

## Cox Proportional Hazards

```{r Cox Proportional Hazards}
# Fit one univariable Cox proportional-hazards model per column in
# `column_names` and collect one result row per model coefficient. A
# two-level column contributes one row; a column with more levels (for
# example the ALAN category) contributes one row per non-reference level,
# identified by `term`.

# A column name that ends in "_<number>" carries its cutoff in that suffix;
# any other column has no cutoff.
cutoff_pattern_cca <- "^.*_(\\d+(\\.\\d+)?)$"

cox_results_list_cca <- vector("list", length(column_names))
names(cox_results_list_cca) <- column_names

# Loop through variables for Cholangiocarcinoma
for (col in column_names) {
  cutoff <- if (grepl(cutoff_pattern_cca, col)) {
    as.numeric(sub(cutoff_pattern_cca, "\\1", col))
  } else {
    NA_real_
  }

  # Fit the Cox model once per column
  formula_cca <- as.formula(paste("Surv(time_diff_months, Event_Status) ~", col))
  cox_model_cca <- coxph(formula_cca, data = cca_df)

  # Print Cox model for Cholangiocarcinoma
  cat(rep("-", 30), "\n")
  cat("Cox Proportional Hazards for Cholangiocarcinoma  -", col, "\n")
  cat(rep("-", 30), "\n")
  print(cox_model_cca)

  # Wald p-value (3 significant digits) and hazard ratio of every coefficient
  coef_table_cca <- summary(cox_model_cca)$coefficients
  cox_p_value_cca <- signif(unname(coef_table_cca[, "Pr(>|z|)"]), 3)
  cox_hazard_ratio_cca <- unname(exp(coef(cox_model_cca)))

  cat("P-value:", cox_p_value_cca, "\n")
  cat("Hazard Ratio:", cox_hazard_ratio_cca, "\n")

  cox_results_list_cca[[col]] <- data.frame(
    column_names = col,
    cutoff = cutoff,
    cox_p_value = cox_p_value_cca,
    cox_hazard_ratio = cox_hazard_ratio_cca,
    term = rownames(coef_table_cca),
    stringsAsFactors = FALSE
  )
}

# Combined results: p-values and hazard ratios stay on the same row, so no
# merge step is needed
coxph_df_cca <- do.call(rbind, cox_results_list_cca)
rownames(coxph_df_cca) <- NULL

# Separate p-value and hazard-ratio tables, derived from the combined one
cox_p_values_df_cca <- coxph_df_cca[, c("column_names", "cutoff", "cox_p_value")]
cox_hazard_ratios_df_cca <- coxph_df_cca[, c("column_names", "cutoff", "cox_hazard_ratio")]

# Print the combined data frame for cca Cholangiocarcinoma using kable
kable(coxph_df_cca, caption = "Cox Proportional Hazards Results for Cholangiocarcinoma")

# Print the structure of the combined data frame for cca Cholangiocarcinoma
str(coxph_df_cca)

```


## Schoenfeld Residuals Test

```{r Schoenfeld Test}
# Proportional-hazards check: for every column in `column_names`, fit a
# univariable Cox model, run the Schoenfeld residual test on it and draw the
# residual plot. Tests and plots are kept in two named lists.
schoenfeld_results_list <- list()
schoenfeld_plots_list <- list()

for (col in column_names) {
  # Univariable Cox model for the current column
  formula_cca <- as.formula(paste("Surv(time_diff_months, Event_Status) ~", col))
  cox_model_cca <- coxph(formula_cca, data = cca_df)

  # Schoenfeld residual test, printed under a banner
  schoenfeld_test_cca <- cox.zph(cox_model_cca)
  cat(rep("-", 45), "\n")
  cat("Schoenfeld Test for Cholangiocarcinoma -", col, "\n")
  cat(rep("-", 45), "\n")
  print(schoenfeld_test_cca)

  # Residual plot for the same test
  schoenfeld_plot_cca <- ggcoxzph(
    schoenfeld_test_cca,
    caption = paste("Schoenfeld Plot of Cholangiocarcinoma for residuals of", col)
  )

  # Store both under lower-case names such as "schoenfeld_test_ly_1.5"
  schoenfeld_results_list[[paste0("schoenfeld_test_", tolower(col))]] <- schoenfeld_test_cca
  schoenfeld_plots_list[[paste0("schoenfeld_plot_", tolower(col))]] <- schoenfeld_plot_cca

  print(schoenfeld_plot_cca)
}

# Print everything that was collected
print(schoenfeld_results_list)
print(schoenfeld_plots_list)

```


# KM Plots

```{r legend labels}
# Grouping columns analysed below, in plotting order
column_names <- c(
  "Albumin_3.5", "LMR_2.1", "MON_0.8", "LY_1.5",
  "ANC_4", "ANC_8", "NLR_3", "NLR_5", "PLT_300",
  "Alk_Phos_135", "Alk_Phos_200",
  "Age_60", "Age_65", "Age_70",
  "Cancer_Status", "Prognostic_Score_ALAN_category"
)

# Legend prefixes, position-matched to column_names (one prefix per column)
legend_titles <- c(
  "Albumin ", "LMR ", "MON ", "LY ",
  "ANC ", "ANC ", "NLR ", "NLR ", "PLT ",
  "Alk_Phos ", "Alk_Phos ",
  "Age ", "Age ", "Age ",
  "", "ALAN_Score:"
)


# Build the legend labels used by the Kaplan-Meier plots.
#
# Every group of a variable gets one label of the form
# "<title><group> (N =<count>)".
#
# Args:
#   df:            data frame holding the grouping columns.
#   column_names:  character vector naming the grouping columns of `df`.
#   legend_titles: character vector of label prefixes, position-matched to
#                  `column_names`.
#
# Returns:
#   A list named by `column_names`; each element is the character vector of
#   labels for that variable (factor columns: in level order).
create_legend_labels <- function(df, column_names, legend_titles) {
  # A missing prefix would silently produce labels starting with "NA"
  if (length(legend_titles) < length(column_names)) {
    stop(
      "`legend_titles` needs one entry per element of `column_names`.",
      call. = FALSE
    )
  }

  legend_labels <- list()
  for (i in seq_along(column_names)) {
    variable <- column_names[i]

    # table() counts every group in level order and leaves NA out, so a
    # missing value can no longer turn a group size into NA. It also supplies
    # the group names for columns that are not factors.
    group_counts <- table(df[[variable]])

    legend_labels[[variable]] <- paste0(
      legend_titles[i], names(group_counts),
      " (N =", as.vector(group_counts), ")"
    )
  }
  legend_labels
}


# Legend labels for every grouping column of the Cholangiocarcinoma data
legend_labels_cca <- create_legend_labels(
  df = cca_df,
  column_names = column_names,
  legend_titles = legend_titles
)

# Show the generated labels
print(legend_labels_cca)

```


```{r create our models}
# Fit one single-predictor Cox proportional hazards model per column.
#
# Args:
#   data:         data frame with `time_diff_months`, `Event_Status` and the
#                 predictor columns.
#   column_names: character vector of predictor column names.
#
# Returns:
#   A list of `coxph` fits named by `column_names`.
fit_cox_models <- function(data, column_names) {
  cox_models <- list()
  for (col in column_names) {
    formula <- as.formula(paste("Surv(time_diff_months, Event_Status) ~", col))
    cox_model <- coxph(formula, data = data)

    # coxph() records the call as written, i.e. `formula = formula`. Store the
    # real formula in the call so print() and helpers that inspect
    # `fit$call$formula` see the model that was actually fitted.
    cox_model$call$formula <- formula

    cox_models[[col]] <- cox_model
  }
  cox_models
}

# Fit one Kaplan-Meier estimate per grouping column.
#
# Args:
#   data:         data frame with `time_diff_months`, `Event_Status` and the
#                 grouping columns.
#   column_names: character vector of grouping column names.
#
# Returns:
#   A list of `survfit` objects named by `column_names`.
fit_km_models <- function(data, column_names) {
  km_models <- list()
  for (col in column_names) {
    formula <- as.formula(paste("Surv(time_diff_months, Event_Status) ~", col))
    km_model <- survfit(formula, data = data)

    # survfit() records the call as written, i.e. `formula = formula`. Store
    # the real formula in the call so print() and helpers that read
    # `fit$call$formula` can recover the grouping variable.
    km_model$call$formula <- formula

    km_models[[col]] <- km_model
  }
  km_models
}

# Single-predictor Cox fits for the Cholangiocarcinoma data, one per column
cox_models_cca <- fit_cox_models(data = cca_df, column_names = column_names)

# Kaplan-Meier fits for the same columns
km_models_cca <- fit_km_models(data = cca_df, column_names = column_names)

```

```{r check logrank and HR}
# Print the Log-Rank P-value and Hazard Ratio for each variable
# cat() joins its arguments with a space, so each rule is 40 dashes separated
# by spaces; print() shows the heading as a quoted string
cat(rep("-", 40), "\n")
print("Log-Rank pvalue and HR for  Cholangiocarcinoma")
cat(rep("-", 40), "\n")
# km_models_cca is named by the column it was fitted on, so its names drive
# the lookups below
for (col in names(km_models_cca)) {
  # Both lookup tables are built outside this chunk; every row whose key
  # equals `col` is returned, so no match prints an empty value
  logrank_p_cca <- log_rank_results_df$logrank_p_value[log_rank_results_df$variable == col]
  cox_HR_cca <- coxph_df_cca$cox_hazard_ratio[coxph_df_cca$column_names == col]
  cat("Variable:", col, "\n")
  cat("Log-Rank P-value:", logrank_p_cca, "\n")
  cat("Hazard Ratio:", cox_HR_cca, "\n")
}

```

```{r}
# Grouping columns that get a Kaplan-Meier plot
variables_of_interest <- c(
  "Albumin_3.5", "LMR_2.1", "MON_0.8", "LY_1.5",
  "ANC_4", "ANC_8", "NLR_3", "NLR_5", "PLT_300",
  "Alk_Phos_135", "Alk_Phos_200",
  "Age_60", "Age_65", "Age_70",
  "Cancer_Status"
)
```


```{r KM Plot }
# Render one Kaplan-Meier plot per entry of variables_of_interest into a
# multi-page PDF (one page per variable).

# Surv() is given a logical event indicator. The conversion does not depend on
# the plotted variable, so it is done once instead of on every iteration.
cca_df$Event_Status <- as.logical(cca_df$Event_Status)

# The survival outcome is identical for every plot
surv_obj <- Surv(time = cca_df$time_diff_months, event = cca_df$Event_Status)

# Open a PDF device
pdf("output/cca_km_plots.pdf")

# `finally` closes the device even when one of the plots fails; otherwise the
# PDF would stay open and later plots would be drawn into it.
tryCatch(
  {
    # Loop through each variable and save its plot on a separate page
    for (variable in variables_of_interest) {
      legend_label <- legend_labels_cca[[variable]]
      logrank_p <- log_rank_results_df$logrank_p_value[log_rank_results_df$variable == variable]
      cox_HR <- coxph_df_cca$cox_hazard_ratio[coxph_df_cca$column_names == variable]

      # Extract the variable values from the dataframe
      variable_values <- cca_df[[variable]]

      # Fit Kaplan-Meier model
      km_fit <- survfit(surv_obj ~ variable_values, data = cca_df)

      # Scientific notation with 3 significant figures. Several values for one
      # variable are joined so the annotation stays a single label.
      logrank_p <- paste(
        format(logrank_p, scientific = TRUE, digits = 3),
        collapse = ", "
      )
      cox_HR <- paste(
        format(cox_HR, scientific = TRUE, digits = 3),
        collapse = ", "
      )

      # Create Kaplan-Meier plot
      km_plot <- ggsurvplot(
        km_fit,
        data = cca_df,
        title = paste("Kaplan-Meier Curve of  Cholangiocarcinoma by", variable),
        censor = TRUE,
        xlab = "Time (Months)",
        ylab = "Survival Probability",
        conf.int = TRUE,
        conf.int.style = "step",
        conf.int.alpha = 0.2,
        surv.median.line = "hv",
        xlim = c(0, 24),
        break.time.by = 3,
        breaks = seq(0, 24, by = 3),
        surv.scale = "percent",
        # One label per group, so variables with more than two groups work
        legend.labs = legend_label,
        palette = "lancet"
      )

      # Add annotation to the plot
      km_plot <- km_plot$plot + annotate(
        "text", x = 0, y = 0.05,
        label = paste("Log-Rank p-value:", logrank_p, "\nHazard Ratio:", cox_HR),
        hjust = 0, vjust = 0
      )

      # Apply custom theme
      km_plot <- km_plot + ggsurvfit::theme_ggsurvfit_KMunicate()

      # Save the plot on a separate page
      print(km_plot)
    }
  },
  finally = {
    # Close the PDF device
    dev.off()
  }
)


```


```{r plot ALAN Score}
# Kaplan-Meier plot for the ALAN prognostic score categories
variable <- "Prognostic_Score_ALAN_category"

# Legend entries prepared for this variable
legend_label <- legend_labels_cca[[variable]]

# Log-rank p-value and hazard ratio for this variable, rendered in scientific
# notation with 3 significant figures for the annotation
logrank_p <- format(
  log_rank_results_df$logrank_p_value[log_rank_results_df$variable == variable],
  scientific = TRUE,
  digits = 3
)
cox_HR <- format(
  coxph_df_cca$cox_hazard_ratio[coxph_df_cca$column_names == variable],
  scientific = TRUE,
  digits = 3
)

# Grouping values and survival outcome taken from the data frame
variable_values <- cca_df[[variable]]
surv_obj <- Surv(time = cca_df$time_diff_months, event = cca_df$Event_Status)

# Kaplan-Meier estimate per score category
km_fit <- survfit(surv_obj ~ variable_values, data = cca_df)

# Base survival plot, then the statistics annotation, then the custom theme
km_plot <- ggsurvplot(
  km_fit,
  data = cca_df,
  title = "Kaplan-Meier Curve of Cholangiocarcinoma by ALAN Score",
  censor = TRUE,
  xlab = "Time (Months)",
  ylab = "Survival Probability",
  conf.int = TRUE,
  conf.int.style = "step",
  conf.int.alpha = 0.2,
  surv.median.line = "hv",
  xlim = c(0, 24),
  break.time.by = 3,
  breaks = seq(0, 24, by = 3),
  surv.scale = "percent",
  legend.labs = legend_label,  # one label per score category
  palette = "lancet"
)$plot +
  annotate(
    "text",
    x = 0, y = 0.05,
    label = paste("Log-Rank p-value:", logrank_p, "\nHazard Ratio:", cox_HR),
    hjust = 0, vjust = 0
  ) +
  ggsurvfit::theme_ggsurvfit_KMunicate()

# Show the plot in the document
print(km_plot)

# Keep a PNG copy next to the other outputs
ggsave(
  filename = "output/cholangio_ALAN_kmplot.png",
  plot = km_plot,
  width = 10,
  height = 6
)
```