01
Fetch labelled test set from OSF
action
download
config (3 keys)
{
"expected_columns": [
"chunk_id",
"text",
"human_label",
"human_score"
],
"target": "./data/beige_book_test_200.csv",
"url": "https://osf.io/xq35t/"
}

Reproduce the headline comparison from Smith & Lambert (2026): score the 200-chunk human-labelled test set with each model, compute accuracy / macro F1 / MCC, then check correlations against GDP growth, unemployment, and CPI. This playbook assumes you have already trained or downloaded BeigeSage and have either API access to or local weights for the comparison LLMs. For end-to-end training from scratch, see the `reproduce_beigesage` playbook.
{
"expected_columns": [
"chunk_id",
"text",
"human_label",
"human_score"
],
"target": "./data/beige_book_test_200.csv",
"url": "https://osf.io/xq35t/"
}{
"class_distribution_check": {
"mixed": 0.415,
"negative": 0.195,
"positive": 0.39,
"tolerance": 0.05
},
"constructs": [
"beige_book_sentiment"
],
"max_missing_pct": 0,
"min_observations": 200
}{
"batch_size": 16,
"framework": "huggingface_transformers",
"max_length": 256,
"model": "beigesage"
}{
"models": [
"gpt-4o",
"llama-3.1-8b",
"phi3-3.8b",
"gemma2-9b"
],
"output_format": "json",
"parse_key": "label",
"prompt_template_ref": "zero_shot_prompt",
"retry_on_invalid_json": 3
}{
"bin_thresholds": {
"negative_below": -0.2,
"positive_above": 0.2
},
"method": "vader_compound"
}{
"gold": "human_label",
"metrics": [
"accuracy",
"macro_f1",
"mcc",
"per_class_precision",
"per_class_recall"
],
"predictions": [
"beigesage_prediction",
"gpt4o_prediction",
"llama_prediction",
"phi3_prediction",
"gemma2_prediction",
"vader_prediction"
]
}{
"beigesage": {
"accuracy": 0.71,
"macro_f1": 0.71,
"mcc": 0.55
},
"gemma2": {
"accuracy": 0.59,
"macro_f1": 0.55,
"mcc": 0.41
},
"gpt_4o": {
"accuracy": 0.63,
"macro_f1": 0.62,
"mcc": 0.48
},
"llama_3_1_8b": {
"accuracy": 0.68,
"macro_f1": 0.67,
"mcc": 0.52
},
"phi3": {
"accuracy": 0.62,
"macro_f1": 0.6,
"mcc": 0.45
},
"vader": {
"accuracy": 0.49,
"macro_f1": 0.43,
"mcc": 0.27
}
}{
"gold": "human_label",
"predictions": [
"beigesage_prediction",
"gpt4o_prediction",
"llama_prediction",
"phi3_prediction",
"gemma2_prediction",
"vader_prediction"
]
}{
"mixed_share_beigesage": 0.405,
"mixed_share_human": 0.415,
"mixed_share_non_finetuned_range": [
0.655,
0.785
]
}{
"aggregate_to_period": true,
"input": "./data/beige_books_1970_2023_chunks.csv",
"model": "beigesage",
"output": "./data/beigesage_full_corpus.csv",
"score_normalization": "categorical_to_signed_continuous"
}{
"data_sources": {
"consumer_price_index": "FRED CPIAUCSL",
"gdp_growth_rate": "Brave-Butters-Kelley monthly real GDP growth (Chicago Fed)",
"unemployment_rate": "FRED UNRATE"
},
"variables": [
"beige_book_sentiment",
"gdp_growth_rate",
"unemployment_rate",
"consumer_price_index"
]
}{
"beige_book_sentiment↔consumer_price_index": {
"direction": "positive",
"n": 470,
"p": 0.001,
"r": 0.42
},
"beige_book_sentiment↔gdp_growth_rate": {
"direction": "positive",
"n": 470,
"p": 0.001,
"r": 0.29
},
"beige_book_sentiment↔unemployment_rate": {
"direction": "negative",
"n": 470,
"p": 0.001,
"r": -0.24
}
}{
"dependents": [
"gdp_growth_rate",
"unemployment_rate",
"consumer_price_index"
],
"horizons": [
1,
2,
3
],
"lags": 3,
"predictor": "beige_book_sentiment"
}{
"consumer_price_index": {
"delta_r2_h3": 0.003
},
"gdp_growth_rate": {
"baseline_r2_h1": 0.451,
"delta_r2_h1": 0.003,
"delta_r2_h2": 0.022,
"delta_r2_h3": 0.032
},
"unemployment_rate": {
"delta_r2_h3": 0.035
}
}