# evaluation-suite.py
"""
Evaluation suite for the MedConnect matching model.
Computes quality metrics and checks against configurable thresholds.
"""

import json
import sys

# Eval thresholds -- adjust these based on the project requirements.
# recall/precision/f1 are minimums (metric must be >= value);
# max_fairness_gap is a maximum (metric must be <= value).
THRESHOLDS = {
    "recall": 0.55,
    "precision": 0.50,
    "f1": 0.55,
    "max_fairness_gap": 0.15,
}


def load_model(model_path="model/matching_model.pkl"):
    """Load and return the pickled matching model from *model_path*."""
    import pickle
    # NOTE(review): pickle.load is unsafe on untrusted files; this assumes the
    # artifact comes from the project's own training pipeline -- confirm.
    with open(model_path, "rb") as handle:
        return pickle.load(handle)


def load_test_data(data_path="data/test_data.csv"):
    """Read the evaluation CSV at *data_path* and return its rows as dicts.

    The first line of the file is treated as the header; each subsequent
    line becomes one mapping of column name -> string value.
    """
    import csv
    with open(data_path, "r") as handle:
        return list(csv.DictReader(handle))


def compute_metrics(model, data):
    """
    Compute evaluation metrics for the matching model.

    Returns a dict with:
    - accuracy: overall accuracy
    - recall: overall recall for positive class
    - precision: overall precision for positive class
    - f1: overall F1 score
    - per_region_recall: dict mapping region -> recall
    - fairness_gap: max difference in recall across regions
    """
    # Placeholder implementation -- the student configures this
    # with the actual model evaluation logic. Every metric starts at
    # zero so the suite runs end-to-end before real logic is wired in.
    return {
        "accuracy": 0.0,
        "recall": 0.0,
        "precision": 0.0,
        "f1": 0.0,
        "per_region_recall": {},
        "fairness_gap": 0.0,
    }


def check_thresholds(metrics, thresholds):
    """
    Compare computed metrics against thresholds.

    Returns:
        (passed: bool, failures: list of str)
    """
    failures = []

    # These three metrics have a floor: each must be at least its threshold.
    for name in ("recall", "precision", "f1"):
        floor = thresholds[name]
        if metrics[name] < floor:
            failures.append(
                f"FAIL: {name} {metrics[name]:.3f} below threshold {floor}"
            )

    # The fairness gap has a ceiling: it must not exceed the configured max.
    gap_limit = thresholds["max_fairness_gap"]
    if metrics["fairness_gap"] > gap_limit:
        failures.append(
            f"FAIL: fairness_gap {metrics['fairness_gap']:.3f} exceeds threshold {gap_limit}"
        )

    return not failures, failures


def main():
    """Run the full evaluation suite; exits with status 1 on any failure."""
    banner = "=" * 60
    print(banner)
    print("MedConnect Matching Model -- Evaluation Suite")
    print(banner)

    model = load_model()
    rows = load_test_data()
    metrics = compute_metrics(model, rows)

    print("\nMetrics:")
    for name in ("accuracy", "recall", "precision", "f1", "fairness_gap"):
        # Pad each label to 14 characters so the values line up in a column.
        print(f"  {name + ':':<14}{metrics[name]:.3f}")

    per_region = metrics["per_region_recall"]
    if per_region:
        print("\nPer-region recall:")
        for region in sorted(per_region):
            print(f"  {region}: {per_region[region]:.3f}")

    print("\nThresholds:")
    for name, limit in THRESHOLDS.items():
        print(f"  {name}: {limit}")

    passed, failures = check_thresholds(metrics, THRESHOLDS)

    print("\n" + banner)
    if not passed:
        # Report every failing metric, then block the deployment pipeline.
        print("RESULT: FAIL -- Blocking deployment.")
        for line in failures:
            print(f"  {line}")
        sys.exit(1)

    print("RESULT: PASS -- All metrics above threshold.")
    print(banner)


# Script entry point: only run the suite when executed directly, not on import.
if __name__ == "__main__":
    main()