{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:7EEIF6IW46QKWYCE6JV7UAV4YB","short_pith_number":"pith:7EEIF6IW","canonical_record":{"source":{"id":"2503.13377","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-03-17T17:04:20Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"97209bd8163ae50201f4f63563823b1191ded3d066d238972ab094ea7017c1da","abstract_canon_sha256":"c47461ba7328122a8cf3804f43f934a641731cf10e3ddf62d5cb9ec145d00de7"},"schema_version":"1.0"},"canonical_sha256":"f90882f916e7a0ab6044f26bfa02bcc0583d66f90901bbdde83aedd8fcbce415","source":{"kind":"arxiv","id":"2503.13377","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2503.13377","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"arxiv_version","alias_value":"2503.13377v3","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2503.13377","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"pith_short_12","alias_value":"7EEIF6IW46QK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"7EEIF6IW46QKWYCE","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"7EEIF6IW","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:7EEIF6IW46QKWYCE6JV7UAV4YB","target":"record","payload":{"canonical_record":{"source":{"id":"2503.13377","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-03-17T17:04:20Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"97209bd8163ae50201f4f63563823b1191ded3d066d238972ab094ea7017c1da","abstract_canon_sha256":"c47461ba7328122a8cf3804f43f934a641731cf10e3ddf62d5cb9ec145d00de7"},"schema_version":"1.0"},"canonical_sha256":"f90882f916e7a0ab6044f26bfa02bcc0583d66f90901bbdde83aedd8fcbce415","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:15.371202Z","signature_b64":"vyk2HrncO/DFVgjHT2Uh0mEdCxCbyx0x2arBnbRmlEJSCSgNMQEVmArMbUaVoq597ScmHXmoOfW+w+YPXddcCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f90882f916e7a0ab6044f26bfa02bcc0583d66f90901bbdde83aedd8fcbce415","last_reissued_at":"2026-05-17T23:38:15.370647Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:15.370647Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2503.13377","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"TinBRJ23N9raWLNm6OAe6aryPBiODOWht0az8mBujBYcnApstWcWvpK0moVBJIBPe7DsRNmQBIlY0rpF0xfpBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-21T20:35:44.970631Z"},"content_sha256":"ec4d152127f8bee3cbd62850e716ca465ed2b9594b1c76cd2bc392ee2ed3b1bd","schema_version":"1.0","event_id":"sha256:ec4d152127f8bee3cbd62850e716ca465ed2b9594b1c76cd2bc392ee2ed3b1bd"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:7EEIF6IW46QKWYCE6JV7UAV4YB","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Time-R1: Post-Training Large Vision Language Model for Temporal Video Grounding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Reinforcement learning post-training enables large vision-language models to achieve state-of-the-art temporal video grounding with only 2.5K training examples.","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Boshen Xu, Dingyi Yang, Jian Luan, Jianzhong Ju, Junqi Lin, Kejun Lin, Liang Zhang, Qin Jin, Wenxuan Wang, Xiangnan Fang, Yang Du, Ye Wang, Zewen He, Zhenbo Luo, Zihan Xiao, Zihao Yue, Ziheng Wang","submitted_at":"2025-03-17T17:04:20Z","abstract_excerpt":"Temporal Video Grounding (TVG), the task of locating specific video segments based on language queries, is a core challenge in long-form video understanding. While recent Large Vision-Language Models (LVLMs) have shown early promise in tackling TVG through supervised fine-tuning (SFT), their abilities to generalize remain limited. To address this, we propose a novel post-training framework that enhances the generalization capabilities of LVLMs via reinforcement learning (RL). Specifically, our contributions span three key directions: (1) Time-R1: we introduce a reasoning-guided post-training f"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Time-R1 achieves state-of-the-art performance across multiple downstream datasets using only 2.5K training data, while improving its general video understanding capabilities.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That reinforcement learning with verifiable rewards on the curated RL-friendly dataset will produce genuine generalization improvements rather than overfitting to the specific reward formulation or benchmark construction.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Time-R1 applies RL with verifiable rewards to post-train LVLMs for temporal video grounding, reaching state-of-the-art results on multiple datasets using only 2.5K samples while also improving general video capabilities.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Reinforcement learning post-training enables large vision-language models to achieve state-of-the-art temporal video grounding with only 2.5K training examples.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b55102ecade32d8548ddc99a911d88eb186c8461438dcbaeee3207f98dafaf50"},"source":{"id":"2503.13377","kind":"arxiv","version":3},"verdict":{"id":"9e8fe946-10eb-4b1b-a214-b9772e061e94","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T02:34:23.564245Z","strongest_claim":"Time-R1 achieves state-of-the-art performance across multiple downstream datasets using only 2.5K training data, while improving its general video understanding capabilities.","one_line_summary":"Time-R1 applies RL with verifiable rewards to post-train LVLMs for temporal video grounding, reaching state-of-the-art results on multiple datasets using only 2.5K samples while also improving general video capabilities.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That reinforcement learning with verifiable rewards on the curated RL-friendly dataset will produce genuine generalization improvements rather than overfitting to the specific reward formulation or benchmark construction.","pith_extraction_headline":"Reinforcement learning post-training enables large vision-language models to achieve state-of-the-art temporal video grounding with only 2.5K training examples."},"references":{"count":87,"sample":[{"doi":"","year":2025,"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","ref_index":1,"cited_arxiv_id":"2501.12948","is_internal_anchor":true},{"doi":"","year":2023,"title":"Ht- step: Aligning instructional articles with how-to videos","work_id":"8245457b-ee59-461c-8839-ff57852b9855","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2017,"title":"Localizing moments in video with natural language","work_id":"00e6da62-472c-45ac-a3ba-63660f0581c5","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":4,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":2015,"title":"Activitynet: A large-scale video benchmark for human activity understanding","work_id":"2f0f351a-b69d-4767-8213-6807af5fda95","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":87,"snapshot_sha256":"cbbe0739cbd3bc94e71dddd357a39e6d354506aab1108973d3319486c959c32c","internal_anchors":10},"formal_canon":{"evidence_count":3,"snapshot_sha256":"f408f7366e7e837e684a095f79a0910b1eded4116ad1df352b74af930bc00337"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"9e8fe946-10eb-4b1b-a214-b9772e061e94"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"fKb9sZpYsQB3u62bb0az+imDVExEMHULsEZ5r5U7WRxCFzNZ2dlNxQO5SiQu23F1naQCISevjgeA54FRS1EbAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-21T20:35:44.971170Z"},"content_sha256":"4d4d67855eda361ffc69f61a888923286e1fb58c71cf1f5e7dd0d05818af002b","schema_version":"1.0","event_id":"sha256:4d4d67855eda361ffc69f61a888923286e1fb58c71cf1f5e7dd0d05818af002b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/7EEIF6IW46QKWYCE6JV7UAV4YB/bundle.json","state_url":"https://pith.science/pith/7EEIF6IW46QKWYCE6JV7UAV4YB/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/7EEIF6IW46QKWYCE6JV7UAV4YB/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-21T20:35:44Z","links":{"resolver":"https://pith.science/pith/7EEIF6IW46QKWYCE6JV7UAV4YB","bundle":"https://pith.science/pith/7EEIF6IW46QKWYCE6JV7UAV4YB/bundle.json","state":"https://pith.science/pith/7EEIF6IW46QKWYCE6JV7UAV4YB/state.json","well_known_bundle":"https://pith.science/.well-known/pith/7EEIF6IW46QKWYCE6JV7UAV4YB/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:7EEIF6IW46QKWYCE6JV7UAV4YB","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"c47461ba7328122a8cf3804f43f934a641731cf10e3ddf62d5cb9ec145d00de7","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-03-17T17:04:20Z","title_canon_sha256":"97209bd8163ae50201f4f63563823b1191ded3d066d238972ab094ea7017c1da"},"schema_version":"1.0","source":{"id":"2503.13377","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2503.13377","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"arxiv_version","alias_value":"2503.13377v3","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2503.13377","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"pith_short_12","alias_value":"7EEIF6IW46QK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"7EEIF6IW46QKWYCE","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"7EEIF6IW","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:4d4d67855eda361ffc69f61a888923286e1fb58c71cf1f5e7dd0d05818af002b","target":"graph","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Time-R1 achieves state-of-the-art performance across multiple downstream datasets using only 2.5K training data, while improving its general video understanding capabilities."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That reinforcement learning with verifiable rewards on the curated RL-friendly dataset will produce genuine generalization improvements rather than overfitting to the specific reward formulation or benchmark construction."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Time-R1 applies RL with verifiable rewards to post-train LVLMs for temporal video grounding, reaching state-of-the-art results on multiple datasets using only 2.5K samples while also improving general video capabilities."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Reinforcement learning post-training enables large vision-language models to achieve state-of-the-art temporal video grounding with only 2.5K training examples."}],"snapshot_sha256":"b55102ecade32d8548ddc99a911d88eb186c8461438dcbaeee3207f98dafaf50"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"f408f7366e7e837e684a095f79a0910b1eded4116ad1df352b74af930bc00337"},"paper":{"abstract_excerpt":"Temporal Video Grounding (TVG), the task of locating specific video segments based on language queries, is a core challenge in long-form video understanding. While recent Large Vision-Language Models (LVLMs) have shown early promise in tackling TVG through supervised fine-tuning (SFT), their abilities to generalize remain limited. To address this, we propose a novel post-training framework that enhances the generalization capabilities of LVLMs via reinforcement learning (RL). Specifically, our contributions span three key directions: (1) Time-R1: we introduce a reasoning-guided post-training f","authors_text":"Boshen Xu, Dingyi Yang, Jian Luan, Jianzhong Ju, Junqi Lin, Kejun Lin, Liang Zhang, Qin Jin, Wenxuan Wang, Xiangnan Fang, Yang Du, Ye Wang, Zewen He, Zhenbo Luo, Zihan Xiao, Zihao Yue, Ziheng Wang","cross_cats":["cs.AI","cs.CL"],"headline":"Reinforcement learning post-training enables large vision-language models to achieve state-of-the-art temporal video grounding with only 2.5K training examples.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-03-17T17:04:20Z","title":"Time-R1: Post-Training Large Vision Language Model for Temporal Video Grounding"},"references":{"count":87,"internal_anchors":10,"resolved_work":87,"sample":[{"cited_arxiv_id":"2501.12948","doi":"","is_internal_anchor":true,"ref_index":1,"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Ht- step: Aligning instructional articles with how-to videos","work_id":"8245457b-ee59-461c-8839-ff57852b9855","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Localizing moments in video with natural language","work_id":"00e6da62-472c-45ac-a3ba-63660f0581c5","year":2017},{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Activitynet: A large-scale video benchmark for human activity understanding","work_id":"2f0f351a-b69d-4767-8213-6807af5fda95","year":2015}],"snapshot_sha256":"cbbe0739cbd3bc94e71dddd357a39e6d354506aab1108973d3319486c959c32c"},"source":{"id":"2503.13377","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-17T02:34:23.564245Z","id":"9e8fe946-10eb-4b1b-a214-b9772e061e94","model_set":{"reader":"grok-4.3"},"one_line_summary":"Time-R1 applies RL with verifiable rewards to post-train LVLMs for temporal video grounding, reaching state-of-the-art results on multiple datasets using only 2.5K samples while also improving general video capabilities.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Reinforcement learning post-training enables large vision-language models to achieve state-of-the-art temporal video grounding with only 2.5K training examples.","strongest_claim":"Time-R1 achieves state-of-the-art performance across multiple downstream datasets using only 2.5K training data, while improving its general video understanding capabilities.","weakest_assumption":"That reinforcement learning with verifiable rewards on the curated RL-friendly dataset will produce genuine generalization improvements rather than overfitting to the specific reward formulation or benchmark construction."}},"verdict_id":"9e8fe946-10eb-4b1b-a214-b9772e061e94"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ec4d152127f8bee3cbd62850e716ca465ed2b9594b1c76cd2bc392ee2ed3b1bd","target":"record","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"c47461ba7328122a8cf3804f43f934a641731cf10e3ddf62d5cb9ec145d00de7","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-03-17T17:04:20Z","title_canon_sha256":"97209bd8163ae50201f4f63563823b1191ded3d066d238972ab094ea7017c1da"},"schema_version":"1.0","source":{"id":"2503.13377","kind":"arxiv","version":3}},"canonical_sha256":"f90882f916e7a0ab6044f26bfa02bcc0583d66f90901bbdde83aedd8fcbce415","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"f90882f916e7a0ab6044f26bfa02bcc0583d66f90901bbdde83aedd8fcbce415","first_computed_at":"2026-05-17T23:38:15.370647Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:15.370647Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"vyk2HrncO/DFVgjHT2Uh0mEdCxCbyx0x2arBnbRmlEJSCSgNMQEVmArMbUaVoq597ScmHXmoOfW+w+YPXddcCA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:15.371202Z","signed_message":"canonical_sha256_bytes"},"source_id":"2503.13377","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ec4d152127f8bee3cbd62850e716ca465ed2b9594b1c76cd2bc392ee2ed3b1bd","sha256:4d4d67855eda361ffc69f61a888923286e1fb58c71cf1f5e7dd0d05818af002b"],"state_sha256":"d459a3254b7f0bff68941791189491773ac06f0355d4c837f570893015bd4d1e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"D+X6xjLd9cdncialOvYuQyk7e3Y76ggBTJPqqfsc3PSOZgIiM5p6LYn5vVdrz/ITd8WG6eP7dO66wOCXQtVyDw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-21T20:35:44.973883Z","bundle_sha256":"50eebeadca2c5caec4bef76729bc0655379bfdfc634d32dca8d1c435c22e2f90"}}