4  Baseline Characteristics Table

4.1 Overview

Baseline characteristics tables summarize the demographic and clinical characteristics of study participants at enrollment. Following ICH E3 guidance, these tables are essential for understanding the study population and assessing comparability between treatment groups.

This tutorial shows you how to create a baseline characteristics table using Python’s rtflite package.

import polars as pl # Data manipulation
import rtflite as rtf # RTF reporting

4.2 Step 1: Load Data

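We start by loading the Subject-level Analysis Dataset (ADSL) and keeping only the columns needed for the table: subject ID, planned treatment, age, sex, and race. No population filter is applied here, so every participant in ADSL is summarized.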

# Read ADSL and keep only the identifier, planned treatment, and the
# three baseline variables summarized below.
adsl = pl.read_parquet("data/adsl.parquet").select(
    "USUBJID", "TRT01P", "AGE", "SEX", "RACE"
)

adsl
shape: (254, 5)
USUBJID TRT01P AGE SEX RACE
str str f64 str str
"01-701-1015" "Placebo" 63.0 "Female" "White"
"01-701-1023" "Placebo" 64.0 "Male" "White"
"01-701-1028" "Xanomeline High Dose" 71.0 "Male" "White"
"01-701-1033" "Xanomeline Low Dose" 74.0 "Male" "White"
"01-701-1034" "Xanomeline High Dose" 77.0 "Female" "White"
"01-718-1254" "Xanomeline Low Dose" 78.0 "Male" "White"
"01-718-1328" "Xanomeline High Dose" 86.0 "Male" "White"
"01-718-1355" "Placebo" 79.0 "Male" "White"
"01-718-1371" "Xanomeline High Dose" 69.0 "Female" "White"
"01-718-1427" "Xanomeline High Dose" 74.0 "Female" "Black Or African American"

4.3 Step 2: Calculate Summary Statistics

We’ll create separate functions to handle continuous and categorical variables.

4.3.1 Continuous Variables (Age)

For continuous variables, we calculate mean (SD) and median [min, max].

def summarize_continuous(df, var):
    """Summarize a numeric column within each planned treatment group.

    Args:
        df: Polars DataFrame containing a ``TRT01P`` column and ``var``.
        var: Name of the numeric column to summarize.

    Returns:
        DataFrame with one row per ``TRT01P`` value and the columns
        ``mean`` (rounded to 1 decimal), ``sd`` (rounded to 2 decimals),
        ``median``, ``min``, ``max`` and ``n`` (number of rows in the group).
    """
    values = pl.col(var)
    stats = [
        values.mean().round(1).alias("mean"),
        values.std().round(2).alias("sd"),
        values.median().alias("median"),
        values.min().alias("min"),
        values.max().alias("max"),
        pl.len().alias("n"),
    ]
    return df.group_by("TRT01P").agg(stats)

# Summarize AGE by planned treatment group (TRT01P) and display the result
age_stats = summarize_continuous(adsl, "AGE")
age_stats
shape: (3, 7)
TRT01P mean sd median min max n
str f64 f64 f64 f64 f64 u32
"Placebo" 75.2 8.59 76.0 52.0 89.0 86
"Xanomeline Low Dose" 75.7 8.29 77.5 51.0 88.0 84
"Xanomeline High Dose" 74.4 7.89 76.0 56.0 88.0 84

4.3.2 Categorical Variables (Sex, Race)

For categorical variables, we calculate the count in each category and its percentage of the participants in the same treatment group.

def summarize_categorical(df, var):
    """Count categories of ``var`` within each planned treatment group.

    Args:
        df: Polars DataFrame containing a ``TRT01P`` column and ``var``.
        var: Name of the categorical column to tabulate.

    Returns:
        DataFrame with one row per (``TRT01P``, ``var``) combination present
        in ``df`` and the columns ``len`` (rows in that combination),
        ``total`` (rows in the treatment group) and ``pct``
        (``100 * len / total``, rounded to 1 decimal).
    """
    # Numerator: rows per treatment/category pair
    per_category = df.group_by(["TRT01P", var]).len()

    # Denominator: rows per treatment group
    per_treatment = df.group_by("TRT01P").len().rename({"len": "total"})

    pct = (100.0 * pl.col("len") / pl.col("total")).round(1).alias("pct")
    return per_category.join(per_treatment, on="TRT01P").with_columns(pct)

# Tabulate SEX by planned treatment group (TRT01P) and display the result
sex_stats = summarize_categorical(adsl, "SEX")
sex_stats
shape: (6, 5)
TRT01P SEX len total pct
str str u32 u32 f64
"Xanomeline High Dose" "Male" 44 84 52.4
"Xanomeline Low Dose" "Female" 50 84 59.5
"Xanomeline Low Dose" "Male" 34 84 40.5
"Placebo" "Female" 53 86 61.6
"Placebo" "Male" 33 86 38.4
"Xanomeline High Dose" "Female" 40 84 47.6
# Tabulate RACE by planned treatment group (TRT01P) and display the result
race_stats = summarize_categorical(adsl, "RACE")
race_stats
shape: (7, 5)
TRT01P RACE len total pct
str str u32 u32 f64
"Xanomeline High Dose" "American Indian Or Alaska Nati… 1 84 1.2
"Xanomeline High Dose" "Black Or African American" 9 84 10.7
"Placebo" "Black Or African American" 8 86 9.3
"Placebo" "White" 78 86 90.7
"Xanomeline Low Dose" "Black Or African American" 6 84 7.1
"Xanomeline High Dose" "White" 74 84 88.1
"Xanomeline Low Dose" "White" 78 84 92.9

4.4 Step 3: Format Results

Now we turn the summary statistics into the display strings used in a standard baseline table.

4.4.1 Format Age Statistics

# Render the age statistics as display strings:
#   mean_sd      -> "Mean (SD)"
#   median_range -> "Median [Min, Max]"
age_formatted = age_stats.select(
    "TRT01P",
    pl.format("{} ({})", pl.col("mean"), pl.col("sd")).alias("mean_sd"),
    pl.format(
        "{} [{}, {}]", pl.col("median"), pl.col("min"), pl.col("max")
    ).alias("median_range"),
)

age_formatted
shape: (3, 3)
TRT01P mean_sd median_range
str str str
"Placebo" "75.2 (8.59)" "76.0 [52.0, 89.0]"
"Xanomeline Low Dose" "75.7 (8.29)" "77.5 [51.0, 88.0]"
"Xanomeline High Dose" "74.4 (7.89)" "76.0 [56.0, 88.0]"

4.4.2 Format Categorical Statistics

# One shared "n (%)" display expression, built from the "len" and "pct"
# columns, applied to both categorical summaries
n_pct = pl.format("{} ({}%)", pl.col("len"), pl.col("pct")).alias("n_pct")

sex_formatted = sex_stats.select("TRT01P", "SEX", n_pct)
race_formatted = race_stats.select("TRT01P", "RACE", n_pct)

sex_formatted
shape: (6, 3)
TRT01P SEX n_pct
str str str
"Xanomeline High Dose" "Male" "44 (52.4%)"
"Xanomeline Low Dose" "Female" "50 (59.5%)"
"Xanomeline Low Dose" "Male" "34 (40.5%)"
"Placebo" "Female" "53 (61.6%)"
"Placebo" "Male" "33 (38.4%)"
"Xanomeline High Dose" "Female" "40 (47.6%)"

4.5 Step 4: Create Table Structure

We’ll build the table row by row following the standard baseline table format.

# Helper function to get value for a treatment group
def get_value(df, treatment, default="0 (0.0%)"):
    """Return the display value for one treatment group.

    Args:
        df: Polars DataFrame with a ``TRT01P`` column; the value is read
            from its last column.
        treatment: ``TRT01P`` value to look up.
        default: Returned when ``df`` has no row for ``treatment``. The
            default "0 (0.0%)" suits "n (%)" cells; pass ``""`` for cells
            where a zero count is not meaningful.

    Returns:
        The last-column value of the first matching row, or ``default``
        when no row matches.
    """
    matches = df.filter(pl.col("TRT01P") == treatment)
    if matches.height == 0:
        return default
    # The formatted value lives in the last column; take the first match
    return matches[matches.columns[-1]][0]

# Build the baseline table structure
# Treatment arms in display order. This single list drives every data row
# and the column schema, so the two cannot drift apart.
treatments = ["Placebo", "Xanomeline Low Dose", "Xanomeline High Dose"]
blank_cells = [""] * len(treatments)

table_rows = []

# Age section: a header row, then one row per formatted statistic
table_rows.append(["Age (years)", *blank_cells])
for label, stat_col in [
    ("  Mean (SD)", "mean_sd"),
    ("  Median [Min, Max]", "median_range"),
]:
    stat_values = age_formatted.select(["TRT01P", stat_col])
    # get_value() falls back to "0 (0.0%)" when an arm is missing; that
    # placeholder is wrong for a continuous summary, so blank it out.
    table_rows.append(
        [label]
        + [
            get_value(stat_values, trt).replace("0 (0.0%)", "")
            for trt in treatments
        ]
    )

# Categorical sections: a header row, then one "n (%)" row per category.
# Categories are listed explicitly to fix their display order.
categorical_sections = [
    ("Sex", "SEX", sex_formatted, ["Female", "Male"]),
    (
        "Race",
        "RACE",
        race_formatted,
        ["White", "Black Or African American", "American Indian Or Alaska Native"],
    ),
]
for header, column, formatted, categories in categorical_sections:
    table_rows.append([header, *blank_cells])
    for category in categories:
        category_values = formatted.filter(pl.col(column) == category)
        table_rows.append(
            [f"  {category}"]
            + [get_value(category_values, trt) for trt in treatments]
        )

# Create DataFrame from table rows
baseline_table = pl.DataFrame(
    table_rows,
    schema=["Characteristic", *treatments],
    orient="row",
)

baseline_table
shape: (10, 4)
Characteristic Placebo Xanomeline Low Dose Xanomeline High Dose
str str str str
"Age (years)" "" "" ""
"  Mean (SD)" "75.2 (8.59)" "75.7 (8.29)" "74.4 (7.89)"
"  Median [Min, Max]" "76.0 [52.0, 89.0]" "77.5 [51.0, 88.0]" "76.0 [56.0, 88.0]"
"Sex" "" "" ""
"  Female" "53 (61.6%)" "50 (59.5%)" "40 (47.6%)"
"  Male" "33 (38.4%)" "34 (40.5%)" "44 (52.4%)"
"Race" "" "" ""
"  White" "78 (90.7%)" "78 (92.9%)" "74 (88.1%)"
"  Black Or African American" "8 (9.3%)" "6 (7.1%)" "9 (10.7%)"
"  American Indian Or Alaska Na… "0 (0.0%)" "0 (0.0%)" "1 (1.2%)"

4.6 Step 5: Generate Publication-Ready Output

Finally, we format the baseline table for regulatory submission using the rtflite package.

# Per-arm participant counts, shown as "(N=...)" in the column headers
treatment_n = adsl.group_by("TRT01P").len().sort("TRT01P")
n_placebo, n_low, n_high = (
    treatment_n.filter(pl.col("TRT01P") == arm)["len"][0]
    for arm in ("Placebo", "Xanomeline Low Dose", "Xanomeline High Dose")
)

# Build each part of the document separately, then assemble them
baseline_title = rtf.RTFTitle(
    text=[
        "Baseline Characteristics of Participants",
        "(All Participants Randomized)",
    ]
)

# Header and body use the same alignment and relative widths so that the
# header cells line up with the data columns
baseline_header = rtf.RTFColumnHeader(
    text=[
        "Characteristic",
        f"Placebo\n(N={n_placebo})",
        f"Xanomeline Low Dose\n(N={n_low})",
        f"Xanomeline High Dose\n(N={n_high})",
    ],
    text_justification=["l", "c", "c", "c"],
    col_rel_width=[3, 2, 2, 2],
)
baseline_body = rtf.RTFBody(
    text_justification=["l", "c", "c", "c"],
    col_rel_width=[3, 2, 2, 2],
)
baseline_source = rtf.RTFSource(text=["Source: ADSL dataset"])

doc_baseline = rtf.RTFDocument(
    df=baseline_table,
    rtf_title=baseline_title,
    rtf_column_header=baseline_header,
    rtf_body=baseline_body,
    rtf_source=baseline_source,
)

doc_baseline.write_rtf("rtf/tlf_baseline.rtf")  # Save as RTF for submission
rtf/tlf_baseline.rtf