{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:MLZKOIP6MHGCTDJRSGGMHSTZLE","short_pith_number":"pith:MLZKOIP6","schema_version":"1.0","canonical_sha256":"62f2a721fe61cc298d31918cc3ca795932f0413baad48cfb02a6db9a25f5e6d4","source":{"kind":"arxiv","id":"2604.13491","version":3},"attestation_state":"computed","paper":{"title":"FiRe: Fine-grained Multimodal Reasoning for Enhanced Image Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Decomposing text prompts into semantic units and verifying each via visual questions lets multimodal models refine generated images with targeted fixes.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Hyomin Kim, Jeeyoung Yun, Minjun Kim, Sungwoong Kim, Yerin Kim, Yongjin Kim, Yoonjin Oh, Yujung Heo","submitted_at":"2026-04-15T05:24:29Z","abstract_excerpt":"With the rapid progress of Multimodal Large Language Models (MLLMs), unified MLLMs that jointly perform image understanding and generation have advanced significantly. However, despite the inherent reasoning capabilities of unified MLLMs for self-reflection and self-refinement, their use in text-to-image generation remains largely underexplored. Meanwhile, existing multimodal reasoning-based image generation methods mostly rely on prompt augmentation or holistic image-text alignment judgments, without fine-grained reflection and refinement of detailed prompt attributes, leading to limited fine"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2604.13491","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-15T05:24:29Z","cross_cats_sorted":[],"title_canon_sha256":"61fb2bfc827015116a5c93382df4b591306b087e102f7c60aabe079c7869bf13","abstract_canon_sha256":"2a765cc5533c9d5e8ef9c699e63b199144bb2c2c0041a4a1cb6020dcc407bc50"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:05:54.609549Z","signature_b64":"DM4CzPnTBixBX9Ghe3ZadMKOx6/Z8w1H8Zpsg51KI7+KfkX9uaSZr9TzfHGAoJheTOUOBU+JaJl/AiAOzgAaCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"62f2a721fe61cc298d31918cc3ca795932f0413baad48cfb02a6db9a25f5e6d4","last_reissued_at":"2026-05-27T01:05:54.608818Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:05:54.608818Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FiRe: Fine-grained Multimodal Reasoning for Enhanced Image Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Decomposing text prompts into semantic units and verifying each via visual questions lets multimodal models refine generated images with targeted fixes.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Hyomin Kim, Jeeyoung Yun, Minjun Kim, Sungwoong Kim, Yerin Kim, Yongjin Kim, Yoonjin Oh, Yujung Heo","submitted_at":"2026-04-15T05:24:29Z","abstract_excerpt":"With the rapid progress of Multimodal Large Language Models (MLLMs), unified MLLMs that jointly perform image understanding and generation have advanced significantly. However, despite the inherent reasoning capabilities of unified MLLMs for self-reflection and self-refinement, their use in text-to-image generation remains largely underexplored. Meanwhile, existing multimodal reasoning-based image generation methods mostly rely on prompt augmentation or holistic image-text alignment judgments, without fine-grained reflection and refinement of detailed prompt attributes, leading to limited fine"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"FiMR consistently outperforms image generation baselines, including reasoning-based methods, particularly on compositional text-to-image benchmarks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That VQA-based verification of decomposed prompt units produces reliable, unbiased fine-grained feedback that leads to targeted improvements without introducing new errors or hallucinations.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"FiMR improves text-to-image alignment by breaking prompts into minimal units, verifying each with VQA, and making localized refinements using MLLM reasoning.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Decomposing text prompts into semantic units and verifying each via visual questions lets multimodal models refine generated images with targeted fixes.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"f4902c9effef2608eecb17bebec38fe11e89f31942a0cd545226f1e110252fed"},"source":{"id":"2604.13491","kind":"arxiv","version":3},"verdict":{"id":"c0f708e5-c9eb-4da4-81cb-d9f5a7ec0398","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-10T14:28:25.187652Z","strongest_claim":"FiMR consistently outperforms image generation baselines, including reasoning-based methods, particularly on compositional text-to-image benchmarks.","one_line_summary":"FiMR improves text-to-image alignment by breaking prompts into minimal units, verifying each with VQA, and making localized refinements using MLLM reasoning.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That VQA-based verification of decomposed prompt units produces reliable, unbiased fine-grained feedback that leads to targeted improvements without introducing new errors or hallucinations.","pith_extraction_headline":"Decomposing text prompts into semantic units and verifying each via visual questions lets multimodal models refine generated images with targeted fixes."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.13491/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.13491","created_at":"2026-05-27T01:05:54.608929+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.13491v3","created_at":"2026-05-27T01:05:54.608929+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.13491","created_at":"2026-05-27T01:05:54.608929+00:00"},{"alias_kind":"pith_short_12","alias_value":"MLZKOIP6MHGC","created_at":"2026-05-27T01:05:54.608929+00:00"},{"alias_kind":"pith_short_16","alias_value":"MLZKOIP6MHGCTDJR","created_at":"2026-05-27T01:05:54.608929+00:00"},{"alias_kind":"pith_short_8","alias_value":"MLZKOIP6","created_at":"2026-05-27T01:05:54.608929+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE","json":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE.json","graph_json":"https://pith.science/api/pith-number/MLZKOIP6MHGCTDJRSGGMHSTZLE/graph.json","events_json":"https://pith.science/api/pith-number/MLZKOIP6MHGCTDJRSGGMHSTZLE/events.json","paper":"https://pith.science/paper/MLZKOIP6"},"agent_actions":{"view_html":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE","download_json":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE.json","view_paper":"https://pith.science/paper/MLZKOIP6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.13491&json=true","fetch_graph":"https://pith.science/api/pith-number/MLZKOIP6MHGCTDJRSGGMHSTZLE/graph.json","fetch_events":"https://pith.science/api/pith-number/MLZKOIP6MHGCTDJRSGGMHSTZLE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE/action/storage_attestation","attest_author":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE/action/author_attestation","sign_citation":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE/action/citation_signature","submit_replication":"https://pith.science/pith/MLZKOIP6MHGCTDJRSGGMHSTZLE/action/replication_record"}},"created_at":"2026-05-27T01:05:54.608929+00:00","updated_at":"2026-05-27T01:05:54.608929+00:00"}