import polars as pl
import rtflite as rtf
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy import stats as scipy_stats10 ANCOVA efficacy analysis
Create ANCOVA efficacy analysis tables to evaluate treatment effects while controlling for baseline covariates. Learn to perform ANCOVA modeling with missing data imputation using Python statistical libraries and present results in regulatory format with rtflite.
10.1 Overview
Analysis of Covariance (ANCOVA) is the primary statistical method for efficacy evaluation in clinical trials. Following ICH E9 guidance on statistical principles for clinical trials, ANCOVA provides a robust framework for comparing treatment effects while controlling for baseline covariates.
Key features of ANCOVA efficacy analysis include:
- Covariate adjustment: Controls for baseline values to reduce variability
- Least squares means: Provides treatment effect estimates adjusted for covariates
- Missing data handling: Implements appropriate imputation strategies (e.g., LOCF, MMRM)
- Pairwise comparisons: Tests specific treatment contrasts with confidence intervals
- Regulatory compliance: Follows statistical analysis plan specifications
This tutorial demonstrates how to create a comprehensive ANCOVA efficacy table for glucose change from baseline using Python’s statistical libraries and rtflite for regulatory-compliant formatting.
10.2 Setup
polars.config.Config
adsl = pl.read_parquet("data/adsl.parquet")
adlbc = pl.read_parquet("data/adlbc.parquet")
treatments = ["Placebo", "Xanomeline Low Dose", "Xanomeline High Dose"]10.3 Step 1: Explore laboratory data structure
We start by understanding the glucose data structure and endpoint definitions.
# Display key laboratory variables
# Key ADLBC variables for efficacy analysis
lab_vars = adlbc.select(["USUBJID", "PARAMCD", "PARAM", "AVISIT", "AVISITN", "AVAL", "BASE", "CHG"])
lab_vars| USUBJID | PARAMCD | PARAM | AVISIT | AVISITN | AVAL | BASE | CHG |
|---|---|---|---|---|---|---|---|
| str | str | str | str | f64 | f64 | f64 | f64 |
| "01-701-1015" | "SODIUM" | "Sodium (mmol/L)" | " Baseline" | 0.0 | 140.0 | 140.0 | null |
| "01-701-1015" | "K" | "Potassium (mmol/L)" | " Baseline" | 0.0 | 4.5 | 4.5 | null |
| "01-701-1015" | "CL" | "Chloride (mmol/L)" | " Baseline" | 0.0 | 106.0 | 106.0 | null |
| … | … | … | … | … | … | … | … |
| "01-718-1427" | "_CK" | "Creatine Kinase (U/L) change f… | " Week 8" | 8.0 | -0.5 | null | null |
| "01-718-1427" | "_CK" | "Creatine Kinase (U/L) change f… | "End of Treatment" | 99.0 | -0.5 | null | null |
# Focus on glucose parameter
gluc_visits = adlbc.filter(pl.col("PARAMCD") == "GLUC").select("AVISIT", "AVISITN").unique().sort("AVISITN")
# Available glucose measurement visits
gluc_visits| AVISIT | AVISITN |
|---|---|
| str | f64 |
| " ." | null |
| " Baseline" | 0.0 |
| " Week 2" | 2.0 |
| … | … |
| " Week 26" | 26.0 |
| "End of Treatment" | 99.0 |
10.4 Step 2: Define analysis population and endpoint
Following the Statistical Analysis Plan, we focus on the efficacy population for the primary endpoint.
# Clean data types and prepare datasets
adlbc_clean = adlbc.with_columns([
pl.col(c).cast(str).str.strip_chars()
for c in ["USUBJID", "PARAMCD", "AVISIT", "TRTP"]
])
# Define efficacy population
adsl_eff = adsl.filter(pl.col("EFFFL") == "Y").select(["USUBJID"])
# Efficacy population size
adsl_eff.height
# Filter laboratory data to efficacy population
adlbc_eff = adlbc_clean.join(adsl_eff, on="USUBJID", how="inner")
# Laboratory records in efficacy population
adlbc_eff.height71882
# Examine glucose data availability by visit and treatment
gluc_availability = (
adlbc_eff.filter(pl.col("PARAMCD") == "GLUC")
.group_by(["TRTP", "AVISIT"])
.agg(n_subjects=pl.col("USUBJID").n_unique())
.sort(["TRTP", "AVISIT"])
)
# Glucose data availability by visit
gluc_availability| TRTP | AVISIT | n_subjects |
|---|---|---|
| str | str | u32 |
| "Placebo" | "." | 13 |
| "Placebo" | "Baseline" | 79 |
| "Placebo" | "End of Treatment" | 79 |
| … | … | … |
| "Xanomeline Low Dose" | "Week 6" | 62 |
| "Xanomeline Low Dose" | "Week 8" | 59 |
10.5 Step 3: Implement LOCF imputation strategy
We apply Last Observation Carried Forward (LOCF) for missing Week 24 glucose values.
# Prepare glucose data with LOCF for Week 24 endpoint
gluc_data = (
adlbc_eff
.filter((pl.col("PARAMCD") == "GLUC") & (pl.col("AVISITN") <= 24))
.sort(["USUBJID", "AVISITN"])
.group_by("USUBJID")
.agg([
pl.col("TRTP").first(),
pl.col("BASE").first(),
pl.col("AVAL").filter(pl.col("AVISITN") == 0).first().alias("Baseline"),
pl.col("AVAL").last().alias("Week_24_LOCF"), # LOCF: last available value <= Week 24
pl.col("AVISITN").max().alias("Last_Visit") # Track actual last visit
])
.filter(pl.col("Baseline").is_not_null() & pl.col("Week_24_LOCF").is_not_null())
.with_columns((pl.col("Week_24_LOCF") - pl.col("Baseline")).alias("CHG"))
)
# Subjects with baseline and Week 24 (LOCF) glucose
gluc_data.height
# Sample of prepared analysis data
gluc_data| USUBJID | TRTP | BASE | Baseline | Week_24_LOCF | Last_Visit | CHG |
|---|---|---|---|---|---|---|
| str | str | f64 | f64 | f64 | f64 | f64 |
| "01-701-1015" | "Placebo" | 4.71835 | 4.71835 | 4.49631 | 24.0 | -0.22204 |
| "01-701-1023" | "Placebo" | 5.32896 | 5.32896 | 5.43998 | 4.0 | 0.11102 |
| "01-701-1028" | "Xanomeline High Dose" | 4.77386 | 4.77386 | 5.43998 | 24.0 | 0.66612 |
| … | … | … | … | … | … | … |
| "01-718-1371" | "Xanomeline High Dose" | 6.27263 | 6.27263 | 5.10692 | 12.0 | -1.16571 |
| "01-718-1427" | "Xanomeline High Dose" | 4.55182 | 4.55182 | 3.71917 | 8.0 | -0.83265 |
# Assess LOCF imputation impact
locf_summary = (
gluc_data
.group_by(["TRTP", "Last_Visit"])
.agg(n_subjects=pl.len())
.sort(["TRTP", "Last_Visit"])
)
# LOCF imputation summary (last actual visit used)
locf_summary| TRTP | Last_Visit | n_subjects |
|---|---|---|
| str | f64 | u32 |
| "Placebo" | 2.0 | 2 |
| "Placebo" | 4.0 | 2 |
| "Placebo" | 6.0 | 2 |
| … | … | … |
| "Xanomeline Low Dose" | 20.0 | 5 |
| "Xanomeline Low Dose" | 24.0 | 25 |
10.6 Step 4: Calculate descriptive statistics
We compute baseline, Week 24, and change from baseline statistics by treatment group.
# Calculate comprehensive descriptive statistics
desc_stats = []
for trt in treatments:
# Analysis data for this treatment
trt_data = gluc_data.filter(pl.col("TRTP") == trt)
# Original baseline data (all subjects with baseline)
baseline_full = adlbc_eff.filter(
(pl.col("PARAMCD") == "GLUC") &
(pl.col("AVISIT") == "Baseline") &
(pl.col("TRTP") == trt)
)
desc_stats.append({
"Treatment": trt,
"N_Baseline": baseline_full.height,
"Baseline_Mean": baseline_full["AVAL"].mean() if baseline_full.height > 0 else np.nan,
"Baseline_SD": baseline_full["AVAL"].std() if baseline_full.height > 0 else np.nan,
"N_Week24": trt_data.height,
"Week24_Mean": trt_data["Week_24_LOCF"].mean() if trt_data.height > 0 else np.nan,
"Week24_SD": trt_data["Week_24_LOCF"].std() if trt_data.height > 0 else np.nan,
"N_Change": trt_data.height,
"Change_Mean": trt_data["CHG"].mean() if trt_data.height > 0 else np.nan,
"Change_SD": trt_data["CHG"].std() if trt_data.height > 0 else np.nan
})
# Display descriptive statistics
desc_df = pl.DataFrame(desc_stats)
# Descriptive statistics by treatment
desc_df| Treatment | N_Baseline | Baseline_Mean | Baseline_SD | N_Week24 | Week24_Mean | Week24_SD | N_Change | Change_Mean | Change_SD |
|---|---|---|---|---|---|---|---|---|---|
| str | i64 | f64 | f64 | i64 | f64 | f64 | i64 | f64 | f64 |
| "Placebo" | 79 | 5.656399 | 2.229324 | 79 | 5.639535 | 1.651807 | 79 | -0.016864 | 2.31841 |
| "Xanomeline Low Dose" | 79 | 5.419603 | 0.946102 | 79 | 5.352148 | 1.058004 | 79 | -0.067455 | 1.015715 |
| "Xanomeline High Dose" | 74 | 5.388971 | 1.374893 | 74 | 5.831551 | 2.214668 | 74 | 0.44258 | 1.645179 |
10.7 Step 5: Perform ANCOVA analysis
We fit the ANCOVA model with treatment and baseline glucose as covariates.
# Convert to pandas for statsmodels compatibility
ancova_df = gluc_data.to_pandas()
ancova_df["TRTP"] = pd.Categorical(ancova_df["TRTP"], categories=treatments)
# Fit ANCOVA model: Change = Treatment + Baseline
model = smf.ols("CHG ~ TRTP + BASE", data=ancova_df).fit()
# Display model summary
# ANCOVA Model Summary
model.rsquared
model.fvalue, model.f_pvalue
# Model coefficients
model.summary().tables[1]| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| Intercept | 2.9972 | 0.392 | 7.642 | 0.000 | 2.224 | 3.770 |
| TRTP[T.Xanomeline Low Dose] | -0.1768 | 0.243 | -0.729 | 0.467 | -0.655 | 0.301 |
| TRTP[T.Xanomeline High Dose] | 0.3169 | 0.247 | 1.284 | 0.200 | -0.169 | 0.803 |
| BASE | -0.5329 | 0.062 | -8.543 | 0.000 | -0.656 | -0.410 |
# Calculate adjusted means (LS means) at mean baseline value
base_mean = ancova_df["BASE"].mean()
var_cov = model.cov_params()
ls_means = []
# LS means calculated at baseline mean
base_mean
for i, trt in enumerate(treatments):
# Create prediction vector for LS mean calculation
# Model: CHG = intercept + trt_effect1*(trt==1) + trt_effect2*(trt==2) + base_effect*baseline
x_pred = np.array([1, int(i==1), int(i==2), base_mean])
# Calculate LS mean
ls_mean = model.predict(pd.DataFrame({"TRTP": [trt], "BASE": [base_mean]}))[0]
# Calculate standard error for confidence interval
se_pred = np.sqrt(x_pred @ var_cov @ x_pred.T)
ls_means.append({
"Treatment": trt,
"LS_Mean": ls_mean,
"SE": se_pred,
"CI_Lower": ls_mean - 1.96 * se_pred,
"CI_Upper": ls_mean + 1.96 * se_pred
})
ls_means_df = pl.DataFrame(ls_means)
# LS Means (95% CI)
ls_means_df| Treatment | LS_Mean | SE | CI_Lower | CI_Upper |
|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 |
| "Placebo" | 0.071554 | 0.171563 | -0.264709 | 0.407818 |
| "Xanomeline Low Dose" | -0.105215 | 0.171308 | -0.440977 | 0.230548 |
| "Xanomeline High Dose" | 0.388498 | 0.177055 | 0.041471 | 0.735525 |
10.8 Step 6: Pairwise treatment comparisons
We calculate treatment differences and their statistical significance.
# Calculate pairwise comparisons vs. placebo
tbl2_data = []
comparisons = [
("Xanomeline Low Dose vs. Placebo", "TRTP[T.Xanomeline Low Dose]"),
("Xanomeline High Dose vs. Placebo", "TRTP[T.Xanomeline High Dose]")
]
for comp_name, trt_coef in comparisons:
# Extract coefficient estimates
coef = model.params[trt_coef]
se = model.bse[trt_coef]
t_stat = coef / se
df = model.df_resid
p_value = 2 * (1 - scipy_stats.t.cdf(abs(t_stat), df))
# Calculate confidence interval
ci_lower = coef - scipy_stats.t.ppf(0.975, df) * se
ci_upper = coef + scipy_stats.t.ppf(0.975, df) * se
tbl2_data.append({
"Comparison": comp_name,
"Estimate": coef,
"SE": se,
"CI_Lower": ci_lower,
"CI_Upper": ci_upper,
"t_stat": t_stat,
"p_value": p_value
})
comparison_df = pl.DataFrame(tbl2_data)
# Treatment comparisons vs. placebo
comparison_df| Comparison | Estimate | SE | CI_Lower | CI_Upper | t_stat | p_value |
|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | f64 |
| "Xanomeline Low Dose vs. Placeb… | -0.176769 | 0.242635 | -0.654862 | 0.301324 | -0.728539 | 0.467031 |
| "Xanomeline High Dose vs. Place… | 0.316943 | 0.246806 | -0.169369 | 0.803256 | 1.284179 | 0.200383 |
10.9 Step 7: Prepare tables for RTF output
We format the analysis results into publication-ready tables.
# Table 1: Descriptive Statistics and LS Means
tbl1_data = []
for s, ls in zip(desc_stats, ls_means):
tbl1_data.append([
s["Treatment"],
str(s["N_Baseline"]),
f"{s['Baseline_Mean']:.1f} ({s['Baseline_SD']:.2f})" if not np.isnan(s['Baseline_Mean']) else "",
str(s["N_Week24"]),
f"{s['Week24_Mean']:.1f} ({s['Week24_SD']:.2f})" if not np.isnan(s['Week24_Mean']) else "",
str(s["N_Change"]),
f"{s['Change_Mean']:.1f} ({s['Change_SD']:.2f})" if not np.isnan(s['Change_Mean']) else "",
f"{ls['LS_Mean']:.2f} ({ls['CI_Lower']:.2f}, {ls['CI_Upper']:.2f})"
])
tbl1 = pl.DataFrame(tbl1_data, orient="row", schema=[
"Treatment", "N_Base", "Mean_SD_Base", "N_Wk24", "Mean_SD_Wk24",
"N_Chg", "Mean_SD_Chg", "LS_Mean_CI"
])
# Table 1 - Descriptive Statistics and LS Means
tbl1| Treatment | N_Base | Mean_SD_Base | N_Wk24 | Mean_SD_Wk24 | N_Chg | Mean_SD_Chg | LS_Mean_CI |
|---|---|---|---|---|---|---|---|
| str | str | str | str | str | str | str | str |
| "Placebo" | "79" | "5.7 (2.23)" | "79" | "5.6 (1.65)" | "79" | "-0.0 (2.32)" | "0.07 (-0.26, 0.41)" |
| "Xanomeline Low Dose" | "79" | "5.4 (0.95)" | "79" | "5.4 (1.06)" | "79" | "-0.1 (1.02)" | "-0.11 (-0.44, 0.23)" |
| "Xanomeline High Dose" | "74" | "5.4 (1.37)" | "74" | "5.8 (2.21)" | "74" | "0.4 (1.65)" | "0.39 (0.04, 0.74)" |
# Table 2: Pairwise Comparisons (formatted for output)
tbl2_formatted = []
for row in tbl2_data:
tbl2_formatted.append([
row["Comparison"],
f"{row['Estimate']:.2f} ({row['CI_Lower']:.2f}, {row['CI_Upper']:.2f})",
f"{row['p_value']:.4f}" if row['p_value'] >= 0.0001 else "<0.0001"
])
tbl2 = pl.DataFrame(tbl2_formatted, orient="row", schema=["Comparison", "Diff_CI", "P_Value"])
# Table 2 - Pairwise Comparisons
tbl2| Comparison | Diff_CI | P_Value |
|---|---|---|
| str | str | str |
| "Xanomeline Low Dose vs. Placeb… | "-0.18 (-0.65, 0.30)" | "0.4670" |
| "Xanomeline High Dose vs. Place… | "0.32 (-0.17, 0.80)" | "0.2004" |
10.10 Step 8: Create regulatory-compliant RTF document
We generate a comprehensive efficacy table following regulatory submission standards.
# Create comprehensive RTF document with multiple table sections
doc_ancova = rtf.RTFDocument(
df=[tbl1, tbl2],
rtf_title=rtf.RTFTitle(
text=[
"ANCOVA of Change from Baseline in",
"Fasting Glucose (mmol/L) at Week 24 (LOCF)",
"Efficacy Analysis Population"
]
),
rtf_column_header=[
# Header for descriptive statistics table
[
rtf.RTFColumnHeader(
text=["", "Baseline", "Week 24 (LOCF)", "Change from Baseline", ""],
col_rel_width=[3, 2, 2, 2, 2],
text_justification=["l", "c", "c", "c", "c"],
text_format="b"
),
rtf.RTFColumnHeader(
text=[
"Treatment Group",
"N", "Mean (SD)",
"N", "Mean (SD)",
"N", "Mean (SD)",
"LS Mean (95% CI){^a}"
],
col_rel_width=[3, 0.7, 1.3, 0.7, 1.3, 0.7, 1.3, 2],
text_justification=["l"] + ["c"] * 7,
border_bottom="single",
text_format="b"
)
],
# Header for pairwise comparisons table
[
rtf.RTFColumnHeader(
text=[
"Pairwise Comparison",
"Difference in LS Mean (95% CI){^a}",
"p-Value{^b}"
],
col_rel_width=[5, 4, 2],
text_justification=["l", "c", "c"],
text_format="b",
border_bottom="single"
)
]
],
rtf_body=[
# Body for descriptive statistics
rtf.RTFBody(
col_rel_width=[3, 0.7, 1.3, 0.7, 1.3, 0.7, 1.3, 2],
text_justification=["l"] + ["c"] * 7
),
# Body for pairwise comparisons
rtf.RTFBody(
col_rel_width=[5, 4, 2],
text_justification=["l", "c", "c"]
)
],
rtf_footnote=rtf.RTFFootnote(
text=[
"{^a}LS means and differences in LS means are based on an ANCOVA model with treatment and baseline glucose as covariates.",
f"{{^b}}p-values are from the ANCOVA model testing treatment effects",
"LOCF (Last Observation Carried Forward) approach is used for missing Week 24 values.",
"ANCOVA = Analysis of Covariance; CI = Confidence Interval; LS = Least Squares; SD = Standard Deviation"
]
),
rtf_source=rtf.RTFSource(
text=[
"Source: ADLBC Analysis Dataset",
f"Analysis conducted: {pd.Timestamp.now().strftime('%d%b%Y').upper()}",
"Statistical software: Python (statsmodels)"
]
)
)
# Generate RTF file
doc_ancova.write_rtf("rtf/tlf_efficacy_ancova.rtf")rtf/tlf_efficacy_ancova.rtf
PosixPath('pdf/tlf_efficacy_ancova.pdf')