{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:KCYBYGFNDCFVCHIZJQNV3ZXWCY","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5b402e995dbe6b11df7d0c941b97055f5f558cd0f2637be6ded1421aced958cc","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-03-12T14:34:08Z","title_canon_sha256":"e21d7c1c1d1611ef27dee51ddccca28d890122978ef35588b08dc0edca8a9566"},"schema_version":"1.0","source":{"id":"2403.07691","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2403.07691","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2403.07691v2","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2403.07691","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"KCYBYGFNDCFV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"KCYBYGFNDCFVCHIZ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"KCYBYGFN","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:435e1b27e8a8c0a912c05309d2221288de0f60a50b2ec3e1fdcca99988216bf7","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"fine-tuning Phi-2 (2.7B), Llama-2 (7B), and Mistral (7B) with ORPO on the UltraFeedback alone surpasses the performance of state-of-the-art language models with more than 7B and 13B parameters: achieving up to 12.20% on AlpacaEval 2.0, 66.19% on IFEval (instruction-level loose), and 7.32 in MT-Bench."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the odds ratio is a sensible choice for contrasting favored and disfavored generation styles during supervised fine-tuning, and that a minor penalty for disfavored responses is sufficient to achieve preference alignment."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"ORPO performs preference alignment during supervised fine-tuning via a monolithic odds ratio penalty, allowing 7B models to outperform larger state-of-the-art models on alignment benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A simple odds-ratio penalty during supervised fine-tuning suffices to align language models without any reference model or separate alignment stage."}],"snapshot_sha256":"d661f489a94e7e2c94cc4ed0da8234e957690defda7ae5e63a27b6de6a92c529"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"858eefa9aa66113a655c5e987f23fbeb6d0da6c451fd7ee304de8e5280ca650c"},"paper":{"abstract_excerpt":"While recent preference alignment algorithms for language models have demonstrated promising results, supervised fine-tuning (SFT) remains imperative for achieving successful convergence. In this paper, we study the crucial role of SFT within the context of preference alignment, emphasizing that a minor penalty for the disfavored generation style is sufficient for preference-aligned SFT. Building on this foundation, we introduce a straightforward and innovative reference model-free monolithic odds ratio preference optimization algorithm, ORPO, eliminating the necessity for an additional prefer","authors_text":"James Thorne, Jiwoo Hong, Noah Lee","cross_cats":["cs.AI"],"headline":"A simple odds-ratio penalty during supervised fine-tuning suffices to align language models without any reference model or separate alignment stage.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-03-12T14:34:08Z","title":"ORPO: Monolithic Preference Optimization without Reference Model"},"references":{"count":291,"internal_anchors":26,"resolved_work":291,"sample":[{"cited_arxiv_id":"2311.16867","doi":"","is_internal_anchor":true,"ref_index":2,"title":"The Falcon Series of Open Language Models","work_id":"9ef058cb-28ba-4128-b9b7-a707f2fd36b3","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"arXiv preprint arXiv:2310.12036 , year=","work_id":"44673d8e-2cc2-4818-86d3-24bc812aa41c","year":2023},{"cited_arxiv_id":"2204.05862","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","year":2022},{"cited_arxiv_id":"2212.08073","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Constitutional AI: Harmlessness from AI Feedback","work_id":"faaaa4e0-2676-4fac-a0b4-99aef10d2095","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":6,"title":"Alvaro Bartolome, Gabriel Martin, and Daniel Vila. 2023. Notus. https://github.com/argilla-io/notus","work_id":"238e87d1-57dd-45fb-9e3a-a97ff2d8fb24","year":2023}],"snapshot_sha256":"afab2e8727813d7868992a9df4e83e86d4745a93b5ea7ce843d4f1dfb44da282"},"source":{"id":"2403.07691","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T09:29:41.816743Z","id":"cec109bb-4a38-4920-9044-a2afa5594641","model_set":{"reader":"grok-4.3"},"one_line_summary":"ORPO performs preference alignment during supervised fine-tuning via a monolithic odds ratio penalty, allowing 7B models to outperform larger state-of-the-art models on alignment benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A simple odds-ratio penalty during supervised fine-tuning suffices to align language models without any reference model or separate alignment stage.","strongest_claim":"fine-tuning Phi-2 (2.7B), Llama-2 (7B), and Mistral (7B) with ORPO on the UltraFeedback alone surpasses the performance of state-of-the-art language models with more than 7B and 13B parameters: achieving up to 12.20% on AlpacaEval 2.0, 66.19% on IFEval (instruction-level loose), and 7.32 in MT-Bench.","weakest_assumption":"That the odds ratio is a sensible choice for contrasting favored and disfavored generation styles during supervised fine-tuning, and that a minor penalty for disfavored responses is sufficient to achieve preference alignment."}},"verdict_id":"cec109bb-4a38-4920-9044-a2afa5594641"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:63038446c03108d0e67bb9a0156caeb131e2bdc57259a10e56955c9870071ade","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5b402e995dbe6b11df7d0c941b97055f5f558cd0f2637be6ded1421aced958cc","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-03-12T14:34:08Z","title_canon_sha256":"e21d7c1c1d1611ef27dee51ddccca28d890122978ef35588b08dc0edca8a9566"},"schema_version":"1.0","source":{"id":"2403.07691","kind":"arxiv","version":2}},"canonical_sha256":"50b01c18ad188b511d194c1b5de6f6162b717b773d12372a2d1c31efb8ca5f37","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"50b01c18ad188b511d194c1b5de6f6162b717b773d12372a2d1c31efb8ca5f37","first_computed_at":"2026-05-17T23:38:48.340253Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.340253Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"8k8ajOKSU4JGTaCaE87R4P9y/a/q4fAsxrdBzpOSvn1pSRETpi8Wrs/MxEzHCmXVO3Fuj9zXoFT8dCCYFauFBA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.340862Z","signed_message":"canonical_sha256_bytes"},"source_id":"2403.07691","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:63038446c03108d0e67bb9a0156caeb131e2bdc57259a10e56955c9870071ade","sha256:435e1b27e8a8c0a912c05309d2221288de0f60a50b2ec3e1fdcca99988216bf7"],"state_sha256":"12d79f96a8aa3ed15d9f926e9bf9e300d827dfd3281dc3ffdaad4fa7ead99b09"}