{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:JB5TOC7QDDQ53QAVZVMBXETMM7","short_pith_number":"pith:JB5TOC7Q","schema_version":"1.0","canonical_sha256":"487b370bf018e1ddc015cd581b926c67e63e2ed656d596b8bef951a76f70e7c4","source":{"kind":"arxiv","id":"2605.16740","version":1},"attestation_state":"computed","paper":{"title":"TRACE: Evidence Grounding-Guided Multi-Video Event Understanding and Claim Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"TRACE grounds evidence in text-searchable timelines before visual reasoning for multi-video events.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Abdul Wasi, Akhil Gorugantu, David Doermann, Mahesh Bhosale, Pengyu Yan, Vishvesh Trivedi","submitted_at":"2026-05-16T01:37:10Z","abstract_excerpt":"Multi-video event understanding demands models that can locate and attribute query-relevant evidence scattered across long, heterogeneous video corpora. Existing large vision-language models (LVLMs) often underperform in this regime because they quickly exhaust their context budget and struggle to precisely localize evidentially important segments, frequently missing dense informational cues such as broadcast graphics, subtitles, and scoreboards. We introduce TRACE, an evidence grounding-guided framework that follows a ground-before-reasoning strategy for multi-video event reasoning. Our appro"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.16740","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-16T01:37:10Z","cross_cats_sorted":[],"title_canon_sha256":"30973130461cc3c7190208868ff913d5e109498b56748e2eb099c2e3e66a2b38","abstract_canon_sha256":"9a910ccaaf755069dddeda69068f9352cbc58bd13c4b3a9a96e7083cc93d7a23"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:39.252433Z","signature_b64":"YCzRuyuUvHi4hGCnpck6nI+bTMw78jFcPdEREGA98yc/7ZZHfICKz2rSdw8l3UlhYKyHyRkIrNmuOicW7TpZAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"487b370bf018e1ddc015cd581b926c67e63e2ed656d596b8bef951a76f70e7c4","last_reissued_at":"2026-05-20T00:02:39.251465Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:39.251465Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"TRACE: Evidence Grounding-Guided Multi-Video Event Understanding and Claim Generation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"TRACE grounds evidence in text-searchable timelines before visual reasoning for multi-video events.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Abdul Wasi, Akhil Gorugantu, David Doermann, Mahesh Bhosale, Pengyu Yan, Vishvesh Trivedi","submitted_at":"2026-05-16T01:37:10Z","abstract_excerpt":"Multi-video event understanding demands models that can locate and attribute query-relevant evidence scattered across long, heterogeneous video corpora. Existing large vision-language models (LVLMs) often underperform in this regime because they quickly exhaust their context budget and struggle to precisely localize evidentially important segments, frequently missing dense informational cues such as broadcast graphics, subtitles, and scoreboards. We introduce TRACE, an evidence grounding-guided framework that follows a ground-before-reasoning strategy for multi-video event reasoning. Our appro"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"TRACE raises macro-average MiRAGE F1 from 0.705 to 0.811 compared to an unguided Qwen3-VL-30B baseline, with especially strong improvements in citation recall from 0.440 to 0.628 on the MAGMaR validation split.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The method assumes that OCR and object detection produce sufficiently accurate and complete structured timelines, and that a text-only LLM can reliably select query-relevant moments without missing critical visual cues not captured in text (abstract, method description paragraph).","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"TRACE builds structured text timelines from videos via OCR and detection, then applies text-only LLM evidence localization before LVLM claim generation, raising MiRAGE F1 from 0.705 to 0.811 on MAGMaR.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"TRACE grounds evidence in text-searchable timelines before visual reasoning for multi-video events.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"f269452c89ed68e57263dc6c4a5564768a91e5301d7df20bdb3b059a7745dc93"},"source":{"id":"2605.16740","kind":"arxiv","version":1},"verdict":{"id":"17ca875e-ebeb-4d5b-9819-5f2b6e7c5204","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T21:33:02.076905Z","strongest_claim":"TRACE raises macro-average MiRAGE F1 from 0.705 to 0.811 compared to an unguided Qwen3-VL-30B baseline, with especially strong improvements in citation recall from 0.440 to 0.628 on the MAGMaR validation split.","one_line_summary":"TRACE builds structured text timelines from videos via OCR and detection, then applies text-only LLM evidence localization before LVLM claim generation, raising MiRAGE F1 from 0.705 to 0.811 on MAGMaR.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The method assumes that OCR and object detection produce sufficiently accurate and complete structured timelines, and that a text-only LLM can reliably select query-relevant moments without missing critical visual cues not captured in text (abstract, method description paragraph).","pith_extraction_headline":"TRACE grounds evidence in text-searchable timelines before visual reasoning for multi-video events."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.16740/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T22:01:19.874472Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T21:40:55.392904Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T19:01:56.334872Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T18:33:26.464089Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"7705237be83d5f0181c9406cee3533dd3aab2162bc249c388a84e0aee1adc835"},"references":{"count":14,"sample":[{"doi":"","year":2009,"title":"PP-OCR: A practical ultra lightweight OCR system.CoRR, abs/2009.09941","work_id":"555e4547-6e8f-49f5-acf4-0adae9b359e3","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Video-MME: The First-Ever Comprehensive Evaluation Benchmark of Multi-modal LLMs in Video Analysis","work_id":"77fd5ac9-ae98-4846-9d83-e9c73c8f2a52","ref_index":2,"cited_arxiv_id":"2405.21075","is_internal_anchor":true},{"doi":"","year":null,"title":"Verify exact arXiv ID and au- thor list on Scholar","work_id":"97fb0584-00dd-4710-bc5c-10fad53898d3","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"VideoChat: Chat-Centric Video Understanding","work_id":"07461eec-156c-4054-a28e-b84bc53bf6e1","ref_index":4,"cited_arxiv_id":"2305.06355","is_internal_anchor":true},{"doi":"","year":null,"title":"Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection","work_id":"3757dc8f-79d5-4beb-a03b-eb4c9a33427d","ref_index":5,"cited_arxiv_id":"2303.05499","is_internal_anchor":true}],"resolved_work":14,"snapshot_sha256":"7634dc8b77249c43ee82c60848dd971f5071fb2ddfbc5777ccc8f2d30f186544","internal_anchors":8},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.16740","created_at":"2026-05-20T00:02:39.251637+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.16740v1","created_at":"2026-05-20T00:02:39.251637+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16740","created_at":"2026-05-20T00:02:39.251637+00:00"},{"alias_kind":"pith_short_12","alias_value":"JB5TOC7QDDQ5","created_at":"2026-05-20T00:02:39.251637+00:00"},{"alias_kind":"pith_short_16","alias_value":"JB5TOC7QDDQ53QAV","created_at":"2026-05-20T00:02:39.251637+00:00"},{"alias_kind":"pith_short_8","alias_value":"JB5TOC7Q","created_at":"2026-05-20T00:02:39.251637+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7","json":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7.json","graph_json":"https://pith.science/api/pith-number/JB5TOC7QDDQ53QAVZVMBXETMM7/graph.json","events_json":"https://pith.science/api/pith-number/JB5TOC7QDDQ53QAVZVMBXETMM7/events.json","paper":"https://pith.science/paper/JB5TOC7Q"},"agent_actions":{"view_html":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7","download_json":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7.json","view_paper":"https://pith.science/paper/JB5TOC7Q","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.16740&json=true","fetch_graph":"https://pith.science/api/pith-number/JB5TOC7QDDQ53QAVZVMBXETMM7/graph.json","fetch_events":"https://pith.science/api/pith-number/JB5TOC7QDDQ53QAVZVMBXETMM7/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7/action/timestamp_anchor","attest_storage":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7/action/storage_attestation","attest_author":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7/action/author_attestation","sign_citation":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7/action/citation_signature","submit_replication":"https://pith.science/pith/JB5TOC7QDDQ53QAVZVMBXETMM7/action/replication_record"}},"created_at":"2026-05-20T00:02:39.251637+00:00","updated_at":"2026-05-20T00:02:39.251637+00:00"}