{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:6YETJUN5ELIXCCUFJKB6DORLLZ","short_pith_number":"pith:6YETJUN5","schema_version":"1.0","canonical_sha256":"f60934d1bd22d1710a854a83e1ba2b5e6737dc1ce190bd9a4bcce587715752dd","source":{"kind":"arxiv","id":"2309.14525","version":1},"attestation_state":"computed","paper":{"title":"Aligning Large Multimodal Models with Factually Augmented RLHF","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Factually augmented RLHF aligns large multimodal models to cut hallucinations and reach 94 percent of GPT-4 performance.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Chuang Gan, Chunyuan Li, Haotian Liu, Kurt Keutzer, Liang-Yan Gui, Shengcao Cao, Sheng Shen, Trevor Darrell, Yikang Shen, Yiming Yang, Yu-Xiong Wang, Zhiqing Sun","submitted_at":"2023-09-25T20:59:33Z","abstract_excerpt":"Large Multimodal Models (LMM) are built across modalities and the misalignment between two modalities can result in \"hallucination\", generating textual outputs that are not grounded by the multimodal information in context. To address the multimodal misalignment issue, we adapt the Reinforcement Learning from Human Feedback (RLHF) from the text domain to the task of vision-language alignment, where human annotators are asked to compare two responses and pinpoint the more hallucinated one, and the vision-language model is trained to maximize the simulated human rewards. We propose a new alignme"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2309.14525","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-09-25T20:59:33Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"b87d6710b9a70b3477c1ded6d7d8d8fa6c0ab18f6bcb0fae4771d23b5540c209","abstract_canon_sha256":"4a31ccd234c5dd37a44709d78049d1b291502e92a679cc0c02c73eb12bf35fdf"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.660903Z","signature_b64":"0NMp7G6UfepNO22Tt5UQ0QamUC8ajQmGsBxItRm/IvBgbWBvZcP0gxWvzSZrJ2d6kpHEv9zOVySlH4B0+PwWCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f60934d1bd22d1710a854a83e1ba2b5e6737dc1ce190bd9a4bcce587715752dd","last_reissued_at":"2026-05-17T23:38:50.660329Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.660329Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Aligning Large Multimodal Models with Factually Augmented RLHF","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Factually augmented RLHF aligns large multimodal models to cut hallucinations and reach 94 percent of GPT-4 performance.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Chuang Gan, Chunyuan Li, Haotian Liu, Kurt Keutzer, Liang-Yan Gui, Shengcao Cao, Sheng Shen, Trevor Darrell, Yikang Shen, Yiming Yang, Yu-Xiong Wang, Zhiqing Sun","submitted_at":"2023-09-25T20:59:33Z","abstract_excerpt":"Large Multimodal Models (LMM) are built across modalities and the misalignment between two modalities can result in \"hallucination\", generating textual outputs that are not grounded by the multimodal information in context. To address the multimodal misalignment issue, we adapt the Reinforcement Learning from Human Feedback (RLHF) from the text domain to the task of vision-language alignment, where human annotators are asked to compare two responses and pinpoint the more hallucinated one, and the vision-language model is trained to maximize the simulated human rewards. We propose a new alignme"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"As the first LMM trained with RLHF, our approach achieves remarkable improvement on the LLaVA-Bench dataset with the 94% performance level of the text-only GPT-4 (while previous best methods can only achieve the 87% level), and an improvement by 60% on MMHAL-BENCH over other baselines.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That augmenting the reward model with image captions and ground-truth options reliably prevents reward hacking without introducing new biases or reducing generalization on open-ended questions.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Factually Augmented RLHF aligns large multimodal models to reduce hallucinations, reaching 94% of GPT-4 on LLaVA-Bench and 60% improvement on the new MMHAL-BENCH.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Factually augmented RLHF aligns large multimodal models to cut hallucinations and reach 94 percent of GPT-4 performance.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e5c1f2b6b7d4778593b652438bf504bc8c8a1b43ed0ec404843e1e47a6586521"},"source":{"id":"2309.14525","kind":"arxiv","version":1},"verdict":{"id":"b20010b3-5fe3-4bb4-a581-74bdcc006219","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T17:51:53.974531Z","strongest_claim":"As the first LMM trained with RLHF, our approach achieves remarkable improvement on the LLaVA-Bench dataset with the 94% performance level of the text-only GPT-4 (while previous best methods can only achieve the 87% level), and an improvement by 60% on MMHAL-BENCH over other baselines.","one_line_summary":"Factually Augmented RLHF aligns large multimodal models to reduce hallucinations, reaching 94% of GPT-4 on LLaVA-Bench and 60% improvement on the new MMHAL-BENCH.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That augmenting the reward model with image captions and ground-truth options reliably prevents reward hacking without introducing new biases or reducing generalization on open-ended questions.","pith_extraction_headline":"Factually augmented RLHF aligns large multimodal models to cut hallucinations and reach 94 percent of GPT-4 performance."},"references":{"count":40,"sample":[{"doi":"","year":null,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","ref_index":1,"cited_arxiv_id":"2305.10403","is_internal_anchor":true},{"doi":"","year":null,"title":"OpenFlamingo: An Open-Source Framework for Training Large Autoregressive Vision-Language Models","work_id":"87bfa84a-e663-4165-806f-93ef439d88d0","ref_index":2,"cited_arxiv_id":"2308.01390","is_internal_anchor":true},{"doi":"","year":null,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","ref_index":3,"cited_arxiv_id":"2308.12966","is_internal_anchor":true},{"doi":"","year":null,"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","ref_index":4,"cited_arxiv_id":"2204.05862","is_internal_anchor":true},{"doi":"","year":1901,"title":"Language models are few-shot learners","work_id":"677093e0-2019-45af-8c52-d9b33dec7e3d","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":40,"snapshot_sha256":"5e3d386cb62190aef6fb62e08c10bb0cca0ee129a80e1414dac24fef753aceca","internal_anchors":27},"formal_canon":{"evidence_count":2,"snapshot_sha256":"8f482e2c6a72c8b21643ab6aa20b3d3f2158fe9749be45a437ac59cf74e60975"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2309.14525","created_at":"2026-05-17T23:38:50.660426+00:00"},{"alias_kind":"arxiv_version","alias_value":"2309.14525v1","created_at":"2026-05-17T23:38:50.660426+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2309.14525","created_at":"2026-05-17T23:38:50.660426+00:00"},{"alias_kind":"pith_short_12","alias_value":"6YETJUN5ELIX","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"6YETJUN5ELIXCCUF","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"6YETJUN5","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":39,"internal_anchor_count":39,"sample":[{"citing_arxiv_id":"2605.23281","citing_title":"DepthAgent: Towards Better Universal Depth Estimation via Sample-wise Expert Selection","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2410.04509","citing_title":"ErrorRadar: Benchmarking Complex Mathematical Reasoning of Multimodal Large Language Models Via Error Detection","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2411.15594","citing_title":"A Survey on LLM-as-a-Judge","ref_index":139,"is_internal_anchor":true},{"citing_arxiv_id":"2602.01970","citing_title":"Small Generalizable Prompt Predictive Models Can Steer Efficient RL Post-Training of Large Reasoning Models","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2603.14184","citing_title":"Deeper Thought, Weaker Aim: Understanding and Mitigating Perceptual Impairment during Reasoning in Multimodal Large Language Models","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20278","citing_title":"ClaimDiff-RL: Fine-Grained Caption Reinforcement Learning through Visual Claim Comparison","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21300","citing_title":"Reducing Object Hallucination in LVLMs via Emphasizing Image-negative Tokens","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15300","citing_title":"Deep Pre-Alignment for VLMs","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2505.18600","citing_title":"Chain-of-Zoom: Extreme Super-Resolution via Scale Autoregression and Preference Alignment","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2505.20214","citing_title":"When Slower Isn't Truer: Inverse Scaling Law of Truthfulness in Multimodal Reasoning","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2506.04565","citing_title":"From Standalone LLMs to Integrated Intelligence: A Survey of Compound Al Systems","ref_index":168,"is_internal_anchor":true},{"citing_arxiv_id":"2506.06856","citing_title":"Vision-EKIPL: External Knowledge-Infused Policy Learning for Visual Reasoning","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2507.21584","citing_title":"TARS: MinMax Token-Adaptive Preference Strategy for Hallucination Reduction in MLLMs","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2408.00724","citing_title":"Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models","ref_index":259,"is_internal_anchor":true},{"citing_arxiv_id":"2311.04257","citing_title":"mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2402.11411","citing_title":"Aligning Modalities in Vision Large Language Models via Preference Fine-tuning","ref_index":172,"is_internal_anchor":true},{"citing_arxiv_id":"2503.07536","citing_title":"LMM-R1: Empowering 3B LMMs with Strong Reasoning Abilities Through Two-Stage Rule-Based RL","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2601.06993","citing_title":"Can Textual Reasoning Improve the Performance of MLLMs on Fine-grained Visual Classification?","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2411.10442","citing_title":"Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization","ref_index":88,"is_internal_anchor":true},{"citing_arxiv_id":"2306.13549","citing_title":"A Survey on Multimodal Large Language Models","ref_index":114,"is_internal_anchor":true},{"citing_arxiv_id":"2509.18154","citing_title":"MiniCPM-V 4.5: Cooking Efficient MLLMs via Architecture, Data, and Training Recipe","ref_index":61,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13156","citing_title":"Dual-Pathway Circuits of Object Hallucination in Vision-Language Models","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13173","citing_title":"OxyEcomBench: Benchmarking Multimodal Foundation Models across E-Commerce Ecosystems","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2503.05236","citing_title":"Unified Reward Model for Multimodal Understanding and Generation","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"2503.01785","citing_title":"Visual-RFT: Visual Reinforcement Fine-Tuning","ref_index":35,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ","json":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ.json","graph_json":"https://pith.science/api/pith-number/6YETJUN5ELIXCCUFJKB6DORLLZ/graph.json","events_json":"https://pith.science/api/pith-number/6YETJUN5ELIXCCUFJKB6DORLLZ/events.json","paper":"https://pith.science/paper/6YETJUN5"},"agent_actions":{"view_html":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ","download_json":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ.json","view_paper":"https://pith.science/paper/6YETJUN5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2309.14525&json=true","fetch_graph":"https://pith.science/api/pith-number/6YETJUN5ELIXCCUFJKB6DORLLZ/graph.json","fetch_events":"https://pith.science/api/pith-number/6YETJUN5ELIXCCUFJKB6DORLLZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ/action/storage_attestation","attest_author":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ/action/author_attestation","sign_citation":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ/action/citation_signature","submit_replication":"https://pith.science/pith/6YETJUN5ELIXCCUFJKB6DORLLZ/action/replication_record"}},"created_at":"2026-05-17T23:38:50.660426+00:00","updated_at":"2026-05-17T23:38:50.660426+00:00"}