Skip to content

API Reference: Analysis Functions

This page documents the analysis and inspection functions in Diet Pandas.

Analysis Functions

analyze()

Analyze a DataFrame and return optimization recommendations without modifying it.

dietpandas.analysis.analyze(df, aggressive=False, categorical_threshold=0.5, sparse_threshold=0.9, optimize_datetimes=True, optimize_sparse_cols=False, optimize_bools=True)

Analyze DataFrame and return optimization recommendations without modifying it.

This function performs a "dry run" of the optimization process, providing insights into potential memory savings and recommended data type changes without actually modifying the DataFrame.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame to analyze

required
aggressive bool

If True, simulate aggressive optimization (float16)

False
categorical_threshold float

Threshold for converting objects to categories

0.5
sparse_threshold float

Threshold for converting to sparse format

0.9
optimize_datetimes bool

If True, include datetime optimization analysis

True
optimize_sparse_cols bool

If True, check for sparse optimization opportunities

False
optimize_bools bool

If True, check for boolean optimization opportunities

True

Returns:

Type Description
DataFrame

DataFrame with one row per analyzed column and the columns:

  • column: Column name
  • current_dtype: Current data type
  • recommended_dtype: Recommended data type after optimization
  • current_memory_mb: Current memory usage in MB
  • optimized_memory_mb: Estimated memory after optimization in MB
  • savings_mb: Memory savings in MB
  • savings_percent: Percent reduction in memory
  • reasoning: Explanation of the recommendation

Examples:

>>> df = pd.DataFrame({'age': [25, 30, 35], 'name': ['A', 'B', 'A']})
>>> analysis = analyze(df)
>>> print(analysis)
Source code in src/dietpandas/analysis.py
def analyze(
    df: pd.DataFrame,
    aggressive: bool = False,
    categorical_threshold: float = 0.5,
    sparse_threshold: float = 0.9,
    optimize_datetimes: bool = True,
    optimize_sparse_cols: bool = False,
    optimize_bools: bool = True,
) -> pd.DataFrame:
    """
    Analyze DataFrame and return optimization recommendations without modifying it.

    This function performs a "dry run" of the optimization process, providing
    insights into potential memory savings and recommended data type changes
    without actually modifying the DataFrame.

    Args:
        df: Input DataFrame to analyze
        aggressive: If True, simulate aggressive optimization (float16)
        categorical_threshold: Threshold for converting objects to categories
        sparse_threshold: Threshold for converting to sparse format
        optimize_datetimes: If True, include datetime optimization analysis
        optimize_sparse_cols: If True, check for sparse optimization opportunities
        optimize_bools: If True, check for boolean optimization opportunities

    Returns:
        DataFrame with columns:
        - column: Column name
        - current_dtype: Current data type
        - recommended_dtype: Recommended data type after optimization
        - current_memory_mb: Current memory usage in MB
        - optimized_memory_mb: Estimated memory after optimization in MB
        - savings_mb: Memory savings in MB
        - savings_percent: Percent reduction in memory
        - reasoning: Explanation of the recommendation

    Examples:
        >>> df = pd.DataFrame({'age': [25, 30, 35], 'name': ['A', 'B', 'A']})
        >>> analysis = analyze(df)
        >>> print(analysis)
    """
    recommendations = []

    for col in df.columns:
        dtype = df[col].dtype
        series = df[col]

        # Skip all-NaN columns: there is no data to base a recommendation on.
        if series.isna().all():
            continue

        current_memory = series.memory_usage(deep=True)
        current_dtype_str = str(dtype)
        recommended_dtype_str = current_dtype_str
        optimized_memory = current_memory
        reasoning = "No optimization needed"

        # Try boolean optimization first: a successful bool conversion is the
        # biggest win for int/object columns, so it short-circuits via `continue`.
        if optimize_bools and (np.issubdtype(dtype, np.integer) or dtype == "object"):
            try:
                optimized = optimize_bool(series)
                if optimized.dtype in ["boolean", bool]:
                    recommended_dtype_str = "boolean"
                    optimized_memory = optimized.memory_usage(deep=True)
                    reasoning = (
                        "Boolean-like values detected (0/1 or yes/no). "
                        "Convert to boolean for 87.5% memory reduction."
                    )
                    recommendations.append(
                        _create_recommendation(
                            col,
                            current_dtype_str,
                            recommended_dtype_str,
                            current_memory,
                            optimized_memory,
                            reasoning,
                        )
                    )
                    continue
            except Exception:
                # Best-effort probe: a failed bool conversion just means we
                # fall through to the other optimizations below.
                pass

        # Try integer optimization (downcast to the smallest fitting int type)
        if np.issubdtype(dtype, np.integer):
            try:
                optimized = optimize_int(series)
                if optimized.dtype != dtype:
                    recommended_dtype_str = str(optimized.dtype)
                    optimized_memory = optimized.memory_usage(deep=True)
                    reasoning = (
                        f"Integer values fit in smaller type. "
                        f"Range: [{series.min()}, {series.max()}]"
                    )
                    if optimize_sparse_cols:
                        from .core import optimize_sparse

                        sparse_optimized = optimize_sparse(optimized, sparse_threshold)
                        # Only mention sparse format when the probe actually
                        # produced a sparse dtype; previously the message was
                        # appended unconditionally, falsely recommending sparse
                        # for every downcast integer column.
                        if isinstance(sparse_optimized.dtype, pd.SparseDtype):
                            recommended_dtype_str = str(sparse_optimized.dtype)
                            optimized_memory = sparse_optimized.memory_usage(deep=True)
                            sparse_pct = sparse_threshold * 100
                            reasoning += (
                                f" Sparse format recommended "
                                f"({sparse_pct}% values are identical)."
                            )
            except Exception:
                pass

        # Try float optimization (float64 -> float32, or float16 if aggressive)
        elif np.issubdtype(dtype, np.floating):
            try:
                optimized = optimize_float(series, aggressive=aggressive)
                if optimized.dtype != dtype:
                    recommended_dtype_str = str(optimized.dtype)
                    optimized_memory = optimized.memory_usage(deep=True)
                    if aggressive:
                        reasoning = (
                            "Float64 → float16 (aggressive mode). "
                            "⚠️  May lose precision for large/small values."
                        )
                    else:
                        reasoning = (
                            "Float64 → float32 conversion. "
                            "Safe for most ML/scientific applications."
                        )
                    if optimize_sparse_cols:
                        from .core import optimize_sparse

                        sparse_optimized = optimize_sparse(optimized, sparse_threshold)
                        # As above: only report sparse when conversion happened.
                        if isinstance(sparse_optimized.dtype, pd.SparseDtype):
                            recommended_dtype_str = str(sparse_optimized.dtype)
                            optimized_memory = sparse_optimized.memory_usage(deep=True)
                            reasoning += " Sparse format also recommended."
            except Exception:
                pass

        # Datetime columns: pandas datetime64 is already compact, so only the
        # reasoning text changes — no dtype recommendation is produced.
        elif pd.api.types.is_datetime64_any_dtype(dtype) and optimize_datetimes:
            reasoning = "Datetime column (already efficient)"

        # Object/string columns: recommend category when cardinality is low.
        elif dtype == "object":
            try:
                unique_ratio = series.nunique() / len(series) if len(series) > 0 else 1
                if unique_ratio < categorical_threshold:
                    optimized = optimize_obj(series, categorical_threshold)
                    if optimized.dtype.name == "category":
                        recommended_dtype_str = "category"
                        optimized_memory = optimized.memory_usage(deep=True)
                        reasoning = (
                            f"Low cardinality ({unique_ratio:.1%} unique values). "
                            f"Convert to category for memory savings."
                        )
                else:
                    reasoning = (
                        f"High cardinality ({unique_ratio:.1%} unique). "
                        f"⚠️  Not suitable for category conversion."
                    )
            except Exception:
                pass

        recommendations.append(
            _create_recommendation(
                col,
                current_dtype_str,
                recommended_dtype_str,
                current_memory,
                optimized_memory,
                reasoning,
            )
        )

    # No analyzable columns (e.g. empty frame or all-NaN columns): return an
    # empty frame with the documented schema so callers can rely on it.
    if not recommendations:
        return pd.DataFrame(
            columns=[
                "column",
                "current_dtype",
                "recommended_dtype",
                "current_memory_mb",
                "optimized_memory_mb",
                "savings_mb",
                "savings_percent",
                "reasoning",
            ]
        )

    result = pd.DataFrame(recommendations)
    # Sort by savings (descending) so the biggest wins appear first.
    result = result.sort_values("savings_mb", ascending=False).reset_index(drop=True)

    return result

Example:

import pandas as pd
import dietpandas as dp

df = pd.DataFrame({
    'id': range(1000),
    'amount': [1.1, 2.2, 3.3] * 333 + [1.1],
    'category': ['A', 'B', 'C'] * 333 + ['A']
})

# Get detailed analysis
analysis_df = dp.analyze(df)
print(analysis_df)
#      column current_dtype recommended_dtype  current_memory_mb  optimized_memory_mb  savings_mb  savings_percent                  reasoning
# 0        id         int64             uint16               0.008                0.002       0.006            75.0    Integer range fits in uint16
# 1    amount       float64            float32               0.008                0.004       0.004            50.0      Standard float optimization
# 2  category        object           category               0.057                0.001       0.056            98.2  Low cardinality (3 unique values)

get_optimization_summary()

Get summary statistics from an analysis DataFrame.

dietpandas.analysis.get_optimization_summary(df, **kwargs)

Get a summary of optimization opportunities.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame to analyze

required
**kwargs

Additional arguments passed to analyze()

{}

Returns:

Type Description
dict

Dictionary with summary statistics:

  • total_memory_mb: Current total memory usage
  • optimized_memory_mb: Estimated memory after optimization
  • total_savings_mb: Total memory savings
  • total_savings_percent: Overall percent reduction
  • optimizable_columns: Number of columns that can be optimized
  • total_columns: Number of columns included in the analysis

Examples:

>>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
>>> summary = get_optimization_summary(df)
>>> print(f"Potential savings: {summary['total_savings_percent']:.1f}%")
Source code in src/dietpandas/analysis.py
def get_optimization_summary(df: pd.DataFrame, **kwargs) -> dict:
    """
    Summarize the optimization opportunities for a DataFrame.

    Args:
        df: Input DataFrame to analyze
        **kwargs: Additional arguments forwarded to analyze()

    Returns:
        Dictionary with summary statistics:
        - total_memory_mb: Current total memory usage
        - optimized_memory_mb: Estimated memory after optimization
        - total_savings_mb: Total memory savings
        - total_savings_percent: Overall percent reduction
        - optimizable_columns: Number of columns that can be optimized
        - total_columns: Number of columns included in the analysis

    Examples:
        >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
        >>> summary = get_optimization_summary(df)
        >>> print(f"Potential savings: {summary['total_savings_percent']:.1f}%")
    """
    report = analyze(df, **kwargs)

    current_total = report["current_memory_mb"].sum()
    optimized_total = report["optimized_memory_mb"].sum()
    saved_total = report["savings_mb"].sum()
    # Guard against division by zero on an empty/zero-memory frame.
    saved_pct = (saved_total / current_total * 100) if current_total > 0 else 0
    n_optimizable = (report["savings_mb"] > 0).sum()

    return {
        "total_memory_mb": current_total,
        "optimized_memory_mb": optimized_total,
        "total_savings_mb": saved_total,
        "total_savings_percent": saved_pct,
        "optimizable_columns": n_optimizable,
        "total_columns": len(report),
    }

Example:

import pandas as pd
import dietpandas as dp

df = pd.DataFrame({
    'id': range(1000),
    'value': [1.5, 2.5, 3.5] * 333 + [1.5]
})

summary = dp.get_optimization_summary(df)

print(summary)
# {
#     'total_memory_mb': 0.016,
#     'optimized_memory_mb': 0.006,
#     'total_savings_mb': 0.010,
#     'total_savings_percent': 62.5,
#     'optimizable_columns': 2,
#     'total_columns': 2
# }

estimate_memory_reduction()

Quickly estimate potential memory reduction percentage.

dietpandas.analysis.estimate_memory_reduction(df, **kwargs)

Quick estimate of potential memory reduction percentage.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame to analyze

required
**kwargs

Additional arguments passed to analyze()

{}

Returns:

Type Description
float

Estimated memory reduction as a percentage (0-100)

Examples:

>>> df = pd.DataFrame({'year': [2020, 2021], 'val': [1.1, 2.2]})
>>> reduction = estimate_memory_reduction(df)
>>> print(f"Estimated reduction: {reduction:.1f}%")
Source code in src/dietpandas/analysis.py
def estimate_memory_reduction(df: pd.DataFrame, **kwargs) -> float:
    """
    Quick estimate of potential memory reduction percentage.

    Args:
        df: Input DataFrame to analyze
        **kwargs: Additional arguments forwarded to analyze()

    Returns:
        Estimated memory reduction as a percentage (0-100)

    Examples:
        >>> df = pd.DataFrame({'year': [2020, 2021], 'val': [1.1, 2.2]})
        >>> reduction = estimate_memory_reduction(df)
        >>> print(f"Estimated reduction: {reduction:.1f}%")
    """
    # Delegate to the full summary and expose just the headline number.
    return get_optimization_summary(df, **kwargs)["total_savings_percent"]

Example:

import pandas as pd
import dietpandas as dp

df = pd.DataFrame({
    'int_col': [1, 2, 3, 4, 5] * 200,
    'float_col': [1.1, 2.2, 3.3, 4.4, 5.5] * 200,
    'str_col': ['A', 'B', 'C', 'A', 'B'] * 200
})

# Quick estimate without detailed analysis
reduction = dp.estimate_memory_reduction(df)
print(f"Estimated reduction: {reduction:.1f}%")
# Estimated reduction: 78.3%

# Compare with full analysis
summary = dp.get_optimization_summary(df)
print(f"Actual reduction: {summary['total_savings_percent']:.1f}%")

Workflow Example

Analyze Before Optimizing

import pandas as pd
import dietpandas as dp

# Load your data
df = pd.read_csv("data.csv")

# 1. Quick estimate
print(f"Expected reduction: {dp.estimate_memory_reduction(df):.1f}%")

# 2. Detailed analysis
analysis = dp.analyze(df)
print(analysis)

# 3. Review summary
summary = dp.get_optimization_summary(df)
print(f"Total savings: {summary['total_savings_mb']:.2f} MB")
print(f"Reduction: {summary['total_savings_percent']:.1f}%")

# 4. Apply optimization
df_optimized = dp.diet(df)

Aggressive Mode Analysis

import pandas as pd
import dietpandas as dp

df = pd.DataFrame({
    'metric': [1.123456789] * 1000
})

# Compare normal vs aggressive mode
normal_analysis = dp.analyze(df, aggressive=False)
aggressive_analysis = dp.analyze(df, aggressive=True)

print("Normal mode:")
print(normal_analysis)
# float64 -> float32 (50% reduction)

print("\nAggressive mode:")
print(aggressive_analysis)
# float64 -> float16 (75% reduction, but possible precision loss)

See Also