Skip to content

Utils

This module contains general utility functions.

utils

Functions:

bold_hdi_values

bold_hdi_values(row: dict, hdi_lower_col: str, hdi_upper_col: str) -> dict

Formats the lower and upper values of a row's HDI (Highest Density Interval) with double asterisks if they have the same sign.

Parameters:

  • row

    (dict) –

    A dictionary representing a row of data.

  • hdi_lower_col

    (str) –

    The column name for the lower HDI value.

  • hdi_upper_col

    (str) –

    The column name for the upper HDI value.

Returns:

  • dict

    The modified row dictionary with the HDI values formatted with double asterisks if they have the same sign.

Source code in stats_utils/utils.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def bold_hdi_values(row: dict, hdi_lower_col: str, hdi_upper_col: str) -> dict:
    """
    Wraps the lower and upper values of a row's HDI (Highest Density
    Interval) in double asterisks when both bounds lie strictly on the same
    side of zero, i.e. when the interval excludes zero.

    Args:
        row: A dictionary (or any object supporting item access and item
            assignment by column name) representing a row of data. It is
            modified in place.
        hdi_lower_col: The column name for the lower HDI value.
        hdi_upper_col: The column name for the upper HDI value.

    Returns:
        The same row object. The two HDI entries are replaced by
        `**value**` strings if both bounds are strictly positive or both are
        strictly negative; otherwise the row is returned unchanged. Rows
        whose bounds cannot be converted to `float` (for example a `"-"`
        placeholder or `None`) are also returned unchanged.
    """
    try:
        hdi_lower = float(row[hdi_lower_col])
        hdi_upper = float(row[hdi_upper_col])
    except (TypeError, ValueError):
        # A bound that is not a number cannot be judged, so leave the row
        # as it is instead of aborting the whole table.
        return row

    # Strict comparisons: a bound equal to zero (including "-0.00" after
    # rounding) means the interval touches zero, and NaN never compares true.
    both_positive = hdi_lower > 0 and hdi_upper > 0
    both_negative = hdi_lower < 0 and hdi_upper < 0

    if both_positive or both_negative:
        row[hdi_lower_col] = f"**{row[hdi_lower_col]}**"
        row[hdi_upper_col] = f"**{row[hdi_upper_col]}**"

    return row

dataframe_to_markdown

dataframe_to_markdown(df: DataFrame, round_dict: dict, rename_dict: dict, pval_columns: Dict[str, float] = None, hdi_columns: List[str] = None, repeated_value_columns: List[str] = None, rename_index: str = 'Predictor') -> str

Processes a pandas DataFrame containing output from some type of statistical model by rounding specified columns, renaming columns, and converting the DataFrame to a markdown table string.

Parameters:

  • df

    (DataFrame) –

    The DataFrame to process.

  • round_dict

    (dict) –

    A dictionary specifying the number of decimal places for each column to round to. Example: {"column1": 2, "column2": 3}

  • rename_dict

    (dict) –

    A dictionary specifying the new column names with optional LaTeX formatting. Example: {"column1": "$column_{1}$", "column2": "$column_{2}$"}

  • pval_columns

    (Dict[str, float], default: None ) –

    A dictionary specifying the significance level for each p-value column. If specified, the column will be converted to a string and significant values will be bolded. Example: {"pval": 0.05, "pval_corr": 0.01}

  • hdi_columns

    (List[str], default: None ) –

    A list of column names representing highest density intervals (HDIs) that should be highlighted to show "significant" values. Must have exactly two entries, where the first corresponds to the lower HDI and the second corresponds to the upper HDI. Defaults to None (no HDI highlighting).

  • repeated_value_columns

    (List[str], default: None ) –

    A list of column names that should be formatted to show repeated values. For example, if we have multiple target variables and the same predictor variables, we can format the target variables to show repeated values. Defaults to None (no columns are formatted this way).

  • rename_index

    (str, default: 'Predictor' ) –

    The name to give to the index column. Defaults to "Predictor". If None, the index is dropped.

Returns:

  • str ( str ) –

    A string representing the DataFrame in markdown format.

Example

df = pd.DataFrame(...)
round_dict = {"df_resid": 0, "ssr": 2, "ss_diff": 2, "F": 2, "Pr(>F)": 3}
rename_dict = {"df_resid": "$df_{R}$", "ssr": "$SS_{R}$", "ss_diff": "$SS_{diff}$", "F": "$F$", "Pr(>F)": "$p$"}
markdown_str = dataframe_to_markdown(df, round_dict, rename_dict, pval_columns={"Pr(>F)": 0.05})

Source code in stats_utils/utils.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
def dataframe_to_markdown(
    df: pd.DataFrame,
    round_dict: dict,
    rename_dict: dict,
    pval_columns: Optional[Dict[str, float]] = None,
    hdi_columns: Optional[List[str]] = None,
    repeated_value_columns: Optional[List[str]] = None,
    rename_index: Optional[str] = "Predictor",
) -> str:
    """
    Processes a pandas DataFrame containing output from some type of
    statistical model by rounding specified columns, renaming columns,
    and converting the DataFrame to a markdown table string.

    The input DataFrame is copied first and is never modified.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        round_dict (dict): A dictionary specifying the number of decimal
            places for each column to round to. Keys that are not columns
            of the DataFrame are ignored. Listed columns are converted to
            numeric and then to fixed-decimal strings, with missing values
            shown as `-`. Example: `{"column1": 2, "column2": 3}`
        rename_dict (dict): A dictionary specifying the new column names
            with optional LaTeX formatting. Applied last, so every other
            argument refers to the original column names. Example:
            `{"column1": "$column_{1}$", "column2": "$column_{2}$"}`
        pval_columns (Dict[str, float]): A dictionary specifying the
            significance level for each p-value column. Values strictly
            below the level are bolded, and a formatted value of `0.000`
            is shown as `<.001`. Only columns that also appear in
            `round_dict` are processed. Example: `{"pval": 0.05,
            "pval_corr": 0.01}`. Defaults to `None` (no p-value columns).
        hdi_columns (List[str]): A list of column names representing
            highest density intervals (HDIs) that should be highlighted
            to show "significant" values. Must have exactly two entries
            where the first corresponds to the lower HDI and the second
            corresponds to the upper HDI. Defaults to `None`.
        repeated_value_columns (List[str]): A list of column names that
            should be formatted to show repeated values. For example, if we
            have multiple target variables and the same predictor variables,
            we can format the target variables to show repeated values.
            Defaults to `None`.
        rename_index (str): The name to give to the index column. Defaults
            to "Predictor". If `None`, the index is dropped.

    Returns:
        str: A string representing the DataFrame in markdown format.

    Raises:
        ValueError: If `hdi_columns` is given but does not contain exactly
            two column names.

    Example:
        df = pd.DataFrame(...)
        round_dict = `{"df_resid": 0, "ssr": 2, "ss_diff": 2, "F": 2,
            "Pr(>F)": 3}`
        rename_dict = `{"df_resid": "$df_{R}$", "ssr": "$SS_{R}$",
            "ss_diff": "$SS_{diff}$", "F": "$F$", "Pr(>F)": "$p$"}`
        markdown_str = dataframe_to_markdown(df, round_dict, rename_dict,
            pval_columns=`{"Pr(>F)": 0.05}`)
    """

    # Set pval columns to an empty dictionary if not provided
    if pval_columns is None:
        pval_columns = {}

    # If HDI columns are specified, ensure there are two columns
    if hdi_columns is not None and len(hdi_columns) != 2:
        raise ValueError("hdi_columns must contain two columns")

    # Create a copy of the DataFrame
    df = df.copy()

    if rename_index is None:
        # Discard the index outright. Moving it into the columns first and
        # then deleting a column called "index" would keep a named index
        # and could delete a genuine data column that happens to be called
        # "index".
        df = df.reset_index(drop=True)
    else:
        # Move the index into the columns (this also leaves a 0..n-1 row
        # index) and give the resulting "index" column its display name
        df = df.reset_index().rename(columns={"index": rename_index})

    # Get rounding precision for each column as a tuple in the column order, as
    # a formatting string
    # NOTE(review): columns missing from round_dict get a ".0f" float format
    # here - confirm that zero decimals is the intended fallback.
    precisions = tuple(f".{round_dict.get(col, 0)}f" for col in df.columns)

    # Drop any columns that are not in the DataFrame
    round_dict = {
        col: places for col, places in round_dict.items() if col in df.columns
    }

    # Apply custom formatting based on round_dict
    for col, decimals in round_dict.items():
        # Convert column to numeric if it is not already
        df[col] = pd.to_numeric(df[col])

        # Identify significant rows while the values are still numeric
        is_pval_column = col in pval_columns
        if is_pval_column:
            significant_rows = df[col].values < pval_columns[col]

        # Render each value with the requested number of decimal places,
        # using "-" as the placeholder for missing values
        formatted = [
            f"{value:.{decimals}f}" if pd.notnull(value) else "-"
            for value in df[col]
        ]

        if is_pval_column:
            # Replace 0.000 with "<.001"
            formatted = [
                "<.001" if text == "0.000" else text for text in formatted
            ]

            # Bold significant rows; pairing by position keeps this
            # independent of the row labels
            formatted = [
                f"**{text}**" if text != "-" and significant else text
                for text, significant in zip(formatted, significant_rows)
            ]

        df[col] = formatted

    # Highlight "significant" HDI columns based on both
    # values having the same sign
    if hdi_columns is not None:
        df = df.apply(
            bold_hdi_values,
            axis=1,
            hdi_lower_col=hdi_columns[0],
            hdi_upper_col=hdi_columns[1],
        )

    # Format columns with repeated values
    if repeated_value_columns is not None:
        for col in repeated_value_columns:
            df = format_column_repeated_values(df, col)

    # Rename the columns
    df_renamed = df.rename(columns=rename_dict)

    # Convert to Markdown string
    return df_renamed.to_markdown(index=False, floatfmt=precisions)

format_column_repeated_values

format_column_repeated_values(df: DataFrame, col_name: str) -> DataFrame

Returns a new DataFrame where only the first occurrence of a repeated value in the given column is shown, and subsequent ones are left blank.

Parameters:

  • df

    (DataFrame) –

    Original DataFrame.

  • col_name

    (str) –

    Name of the column to format.

Returns:

  • DataFrame

    pd.DataFrame: New DataFrame with the given column formatted.

Source code in stats_utils/utils.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def format_column_repeated_values(
    df: pd.DataFrame, col_name: str
) -> pd.DataFrame:
    """
    Blanks out consecutive duplicates in one column of a DataFrame.

    Walking the column from top to bottom, a cell that equals the most
    recent value that was kept is replaced by an empty string, so each run
    of identical values shows its value only once. The input DataFrame is
    left untouched; the changes are made on a copy.

    Args:
        df (pd.DataFrame): Original DataFrame.
        col_name (str): Name of the column to format.

    Returns:
        pd.DataFrame: Copy of `df` with repeated values in `col_name`
            replaced by empty strings.
    """
    # Work on a copy so the caller's DataFrame is not modified
    result = df.copy()

    last_shown = None
    for label, current in result[col_name].items():
        if current == last_shown:
            # Same as the value already on display: blank this cell
            result.at[label, col_name] = ""
            continue

        # A new value starts a new run and stays visible
        last_shown = current

    return result

process_summary_table

process_summary_table(summary_df: DataFrame, predictor_rename_dict: Optional[Dict[str, str]] = None, exclude_predictors: Optional[List[str]] = None, column_rename_dict: Optional[Dict[str, str]] = None, round_dict: Optional[Dict[str, int]] = None) -> DataFrame

Process a summary table DataFrame by renaming columns, applying rounding, and filtering predictors.

Parameters:

  • summary_df

    (DataFrame) –

    The summary table as a DataFrame.

  • predictor_rename_dict

    (Optional[Dict[str, str]], default: None ) –

    A dictionary to rename the predictors in the summary table. Defaults to None.

  • exclude_predictors

    (Optional[List[str]], default: None ) –

    A list of predictors to exclude from the summary table. Defaults to None (nothing is excluded).

  • column_rename_dict

    (Optional[Dict[str, str]], default: None ) –

    A dictionary to rename the summary table columns. Defaults to None.

  • round_dict

    (Optional[Dict[str, int]], default: None ) –

    A dictionary to set the rounding precision for each column. Defaults to None.

Returns:

  • DataFrame

    pd.DataFrame: The processed summary table DataFrame.

Source code in stats_utils/utils.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def process_summary_table(
    summary_df: pd.DataFrame,
    predictor_rename_dict: Optional[Dict[str, str]] = None,
    exclude_predictors: Optional[List[str]] = None,
    column_rename_dict: Optional[Dict[str, str]] = None,
    round_dict: Optional[Dict[str, int]] = None,
) -> pd.DataFrame:
    """
    Process a summary table DataFrame by renaming columns, applying rounding,
    and filtering predictors.

    The input DataFrame is never modified; all changes are made on a copy.

    Args:
        summary_df (pd.DataFrame): The summary table as a DataFrame, with
            the predictor names as its index.
        predictor_rename_dict (Optional[Dict[str, str]], optional): A
            dictionary to rename the predictors in the summary table. If
            `None`, underscores in the predictor names are replaced by
            spaces and the names are title-cased instead. Defaults
            to `None`.
        exclude_predictors (Optional[List[str]], optional): A list of
            predictors to exclude from the summary table, given by their
            names after renaming. Names that are not present are ignored.
            Defaults to `None`.
        column_rename_dict (Optional[Dict[str, str]], optional): A dictionary
            to rename the summary table columns. Defaults to `None`.
        round_dict (Optional[Dict[str, int]], optional): A dictionary to
            set the rounding precision for each column. Keys may be either
            the original or the renamed column names; keys matching no
            column are ignored. Defaults to `None`.

    Returns:
        pd.DataFrame: The processed summary table DataFrame, without the
            "Intercept" row and without any excluded predictors.
    """

    # Copy up front: the index assignment below would otherwise change the
    # caller's DataFrame whenever no column renaming had produced a new one
    summary_df = summary_df.copy()

    # Rename the columns
    if column_rename_dict is not None:
        summary_df = summary_df.rename(columns=column_rename_dict)

    # Round the requested columns. Keys are translated through the column
    # renaming so that original column names keep working after the rename.
    if round_dict:
        column_renames = column_rename_dict or {}
        decimals = {
            column_renames.get(col, col): places
            for col, places in round_dict.items()
        }
        summary_df = summary_df.round(decimals)

    # Rename the predictors
    if predictor_rename_dict is None:
        # No explicit mapping: turn names such as "age__group" into
        # "Age Group"
        summary_df.index = (
            summary_df.index.str.replace("__", " ", regex=False)
            .str.replace("_", " ", regex=False)
            .str.title()
        )
    else:
        summary_df = summary_df.rename(index=predictor_rename_dict)

    # Drop the intercept row
    summary_df = summary_df.drop(index="Intercept", errors="ignore")

    # Drop any excluded predictors
    if exclude_predictors:
        summary_df = summary_df.drop(index=exclude_predictors, errors="ignore")

    return summary_df