01
Pull replication data from OSF
action
download
config (2 keys)
{
"targets": [
"./data/beige_book_corpus_1970_2023.txt",
"./data/labelled_chunks_1000.csv"
],
"url": "https://osf.io/xq35t/"
}
End-to-end reproduction of the BeigeSage model: pull the base RoBERTa weights from Hugging Face, MLM-pretrain on the 1970-2023 Beige Book corpus, run supervised fine-tuning on the 800-chunk labelled training set, and evaluate on the 200-chunk held-out test set. Recommended environment: Google Colab with a T4 or better GPU (matches the authors' setup).
{
"targets": [
"./data/beige_book_corpus_1970_2023.txt",
"./data/labelled_chunks_1000.csv"
],
"url": "https://osf.io/xq35t/"
}{
"max_length": 256,
"output": "./data/bb_chunks_256.arrow",
"stride": 0,
"tokenizer": "FacebookAI/roberta-large"
}{
"base_model": "FacebookAI/roberta-large",
"expected_runtime_minutes": 39,
"mlm_probability": 0.15,
"objective": "masked_language_modeling",
"training_args": {
"learning_rate": "5e-5",
"num_train_epochs": 1,
"output_dir": "./models/beigesage-mlm",
"per_device_train_batch_size": 8,
"save_strategy": "epoch"
},
"training_corpus": "./data/bb_chunks_256.arrow"
}{
"input": "./data/labelled_chunks_1000.csv",
"outputs": {
"test": "./data/test_200.csv",
"train": "./data/train_800.csv"
},
"score_to_label_rule": {
"mixed": "-0.2 \u003c= score \u003c= 0.2",
"negative": "score \u003c -0.2",
"positive": "score \u003e 0.2"
},
"train_test_split": {
"random_seed": 42,
"stratify_by": "label",
"test_n": 200,
"train_n": 800
}
}{
"base_model": "./models/beigesage-mlm",
"expected_runtime_minutes": 314,
"label_map": {
"mixed": 1,
"negative": 0,
"positive": 2
},
"num_labels": 3,
"task": "sequence_classification",
"training_args": {
"evaluation_strategy": "epoch",
"learning_rate": "2e-5",
"load_best_model_at_end": true,
"metric_for_best_model": "f1_macro",
"num_train_epochs": 3,
"output_dir": "./models/beigesage",
"per_device_train_batch_size": 4,
"save_strategy": "epoch",
"weight_decay": 0.01
},
"training_data": "./data/train_800.csv"
}{
"expected_runtime_seconds": 95,
"input": "./data/test_200.csv",
"max_length": 256,
"model": "./models/beigesage",
"output": "./outputs/beigesage_predictions.csv"
}{
"gold_field": "human_label",
"metrics": [
"accuracy",
"macro_f1",
"mcc",
"confusion_matrix",
"per_class_precision_recall_f1"
],
"pred_field": "beigesage_prediction"
}{
"accuracy": 0.71,
"macro_f1": 0.71,
"mcc": 0.55,
"per_class_recall": {
"mixed": 0.65,
"negative": 0.64,
"positive": 0.82
}
}{
"include": [
"model weights",
"tokenizer",
"label_map",
"training README"
],
"private": false,
"repo_id": "\u003cyour-username\u003e/beigesage"
}