{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:7L2LCIDIL22PSJFQG4OCMYKWBU","short_pith_number":"pith:7L2LCIDI","schema_version":"1.0","canonical_sha256":"faf4b120685eb4f924b0371c2661560d14b994c3b8f88f9e5423bea254dd3710","source":{"kind":"arxiv","id":"2307.06942","version":2},"attestation_state":"computed","paper":{"title":"InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Conghui He, Guo Chen, Jiashuo Yu, Kunchang Li, Limin Wang, Ping Luo, Xinhao Li, Xin Ma, Xinyuan Chen, Yali Wang, Yaohui Wang, Yinan He, Yi Wang, Yizhuo Li, Yu Qiao, Ziwei Liu","submitted_at":"2023-07-13T17:58:32Z","abstract_excerpt":"This paper introduces InternVid, a large-scale video-centric multimodal dataset that enables learning powerful and transferable video-text representations for multimodal understanding and generation. The InternVid dataset contains over 7 million videos lasting nearly 760K hours, yielding 234M video clips accompanied by detailed descriptions of total 4.1B words. Our core contribution is to develop a scalable approach to autonomously build a high-quality video-text dataset with large language models (LLM), thereby showcasing its efficacy in learning video-language representation at scale. Specif"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2307.06942","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2023-07-13T17:58:32Z","cross_cats_sorted":[],"title_canon_sha256":"ed56d583d0a3dc7471844fffe8f1ec2c462996e58f23d6c646e929ebea61ff5e","abstract_canon_sha256":"66fa1d6696b6cee9acb169649d8eae60a2ccf732088357b099f5377e1f284a88"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.259998Z","signature_b64":"ATooeu/8BbPC/QpN1+J2HXZ7GzHc2yopAGc/l569k14FCex/NFhC1S9UD9o0SbcqGBkRDIyXfljABWvO5PYoBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"faf4b120685eb4f924b0371c2661560d14b994c3b8f88f9e5423bea254dd3710","last_reissued_at":"2026-05-17T23:38:53.259393Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.259393Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Conghui He, Guo Chen, Jiashuo Yu, Kunchang Li, Limin Wang, Ping Luo, Xinhao Li, Xin Ma, Xinyuan Chen, Yali Wang, Yaohui Wang, Yinan He, Yi Wang, Yizhuo Li, Yu Qiao, Ziwei Liu","submitted_at":"2023-07-13T17:58:32Z","abstract_excerpt":"This paper introduces InternVid, a large-scale video-centric multimodal dataset that enables learning powerful and transferable video-text representations for multimodal understanding and generation. The InternVid dataset contains over 7 million videos lasting nearly 760K hours, yielding 234M video clips accompanied by detailed descriptions of total 4.1B words. Our core contribution is to develop a scalable approach to autonomously build a high-quality video-text dataset with large language models (LLM), thereby showcasing its efficacy in learning video-language representation at scale. Specif"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Learned on InternVid via contrastive learning, this model demonstrates leading zero-shot action recognition and competitive video retrieval performance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The multi-scale LLM-generated descriptions are sufficiently accurate and diverse to produce transferable video-text representations without introducing systematic biases or hallucinations that degrade downstream performance.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"InternVid supplies 7M videos and LLM captions to train ViCLIP, which reaches leading zero-shot action recognition and competitive retrieval performance.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"8b2cd0744bf0778d07ceacb7d7a7aa71e00906bb41fbb6ecccde7bc4a5c493f8"},"source":{"id":"2307.06942","kind":"arxiv","version":2},"verdict":{"id":"d1526bfd-4cd9-4a2d-b042-3a4fe0929739","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T06:25:54.338059Z","strongest_claim":"Learned on InternVid via contrastive learning, this model demonstrates leading zero-shot action recognition and competitive video retrieval performance.","one_line_summary":"InternVid supplies 7M videos and LLM captions to train ViCLIP, which reaches leading zero-shot action recognition and competitive retrieval performance.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The multi-scale LLM-generated descriptions are sufficiently accurate and diverse to produce transferable video-text representations without introducing systematic biases or hallucinations that degrade downstream performance.","pith_extraction_headline":"A scalable LLM-based method creates a 7 million video dataset that trains models with leading zero-shot action recognition."},"references":{"count":82,"sample":[{"doi":"","year":1901,"title":"Language models are few-shot learners","work_id":"b50c9b32-76fe-43d5-b25e-cb27d397e9fd","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"Howto100m: Learning a text-video embedding by watching hundred million narrated video clips","work_id":"4dbd9fea-5e22-428e-af10-fffaa570fb86","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Advancing high-resolution video-language representation with large-scale video transcriptions","work_id":"555e923f-999c-40c6-858a-04eb595d89e2","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Merlot: Multimodal neural script knowledge models","work_id":"cb6de4ab-804d-4ccc-af6b-0ccd5fcd09c9","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Merlot reserve: Neural script knowledge through vision and language and sound","work_id":"6b00c0e3-0506-4e5c-a0fb-2a1e20c58aef","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":82,"snapshot_sha256":"a88efe24071128222f0819d789630685a1cab4688d2d5cc5735df371f8f7842e","internal_anchors":13},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f7f29e802f0c2d67d186bd138e08c6024a5b4da3f85136781187be6252d36b1b"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2307.06942","created_at":"2026-05-17T23:38:53.259493+00:00"},{"alias_kind":"arxiv_version","alias_value":"2307.06942v2","created_at":"2026-05-17T23:38:53.259493+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2307.06942","created_at":"2026-05-17T23:38:53.259493+00:00"},{"alias_kind":"pith_short_12","alias_value":"7L2LCIDIL22P","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"7L2LCIDIL22PSJFQ","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"7L2LCIDI","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":37,"internal_anchor_count":37,"sample":[{"citing_arxiv_id":"2605.23610","citing_title":"EM-Vid: Training-Free Entity-Centric Memory for Efficient and Consistent Multi-Shot Video Generation","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2412.15689","citing_title":"DOLLAR: Few-Step Video Generation via Distillation and Latent Reward Optimization","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2406.03520","citing_title":"VideoPhy: Evaluating Physical Commonsense for Video Generation","ref_index":106,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19223","citing_title":"HAVEN: Hierarchically Aligned Multimodal Benchmark for Unified Video Understanding","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19728","citing_title":"Aero-World: Action-Conditioned Aerial Video Generation from Inertial Controls","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2508.05269","citing_title":"B4DL: A Benchmark for 4D LiDAR LLM in Spatio-Temporal Understanding","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2509.16538","citing_title":"VC-Inspector: Advancing Reference-free Evaluation of Video Captions with Factual Analysis","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2402.15852","citing_title":"NaVid: Video-based VLM Plans the Next Step for Vision-and-Language Navigation","ref_index":102,"is_internal_anchor":true},{"citing_arxiv_id":"2512.00336","citing_title":"MVAD: A Benchmark Dataset for Multimodal AI-Generated Video-Audio Detection","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2501.12386","citing_title":"InternVideo2.5: Empowering Video MLLMs with Long and Rich Context Modeling","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2503.13377","citing_title":"Time-R1: Post-Training Large Vision Language Model for Temporal Video Grounding","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2512.09299","citing_title":"VABench: A Comprehensive Benchmark for Audio-Video Generation","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2512.13511","citing_title":"Adapting MLLMs for Nuanced Video Retrieval","ref_index":73,"is_internal_anchor":true},{"citing_arxiv_id":"2502.06608","citing_title":"TripoSG: High-Fidelity 3D Shape Synthesis using Large-Scale Rectified Flow Models","ref_index":127,"is_internal_anchor":true},{"citing_arxiv_id":"2402.08268","citing_title":"World Model on Million-Length Video And Language With Blockwise RingAttention","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2602.22779","citing_title":"TrajTok: Learning Trajectory Tokens enables better Video Understanding","ref_index":72,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14136","citing_title":"TeDiO: Temporal Diagonal Optimization for Training-Free Coherent Video Diffusion","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14569","citing_title":"Bridging Brain and Semantics: A Hierarchical Framework for Semantically Enhanced fMRI-to-Video Reconstruction","ref_index":103,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12957","citing_title":"GTA: Advancing Image-to-3D World Generation via Geometry Then Appearance Video Diffusion","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2312.14238","citing_title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks","ref_index":153,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12496","citing_title":"CausalCine: Real-Time Autoregressive Generation for Multi-Shot Video Narratives","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26565","citing_title":"DenseStep2M: A Scalable, Training-Free Pipeline for Dense Instructional Video Annotation","ref_index":74,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09976","citing_title":"OZ-TAL: Online Zero-Shot Temporal Action Localization","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23789","citing_title":"MuSS: A Large-Scale Dataset and Cinematic Narrative Benchmark for Multi-Shot Subject-to-Video Generation","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2604.25819","citing_title":"Mutual Forcing: Dual-Mode Self-Evolution for Fast Autoregressive Audio-Video Character Generation","ref_index":49,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU","json":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU.json","graph_json":"https://pith.science/api/pith-number/7L2LCIDIL22PSJFQG4OCMYKWBU/graph.json","events_json":"https://pith.science/api/pith-number/7L2LCIDIL22PSJFQG4OCMYKWBU/events.json","paper":"https://pith.science/paper/7L2LCIDI"},"agent_actions":{"view_html":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU","download_json":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU.json","view_paper":"https://pith.science/paper/7L2LCIDI","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2307.06942&json=true","fetch_graph":"https://pith.science/api/pith-number/7L2LCIDIL22PSJFQG4OCMYKWBU/graph.json","fetch_events":"https://pith.science/api/pith-number/7L2LCIDIL22PSJFQG4OCMYKWBU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/action/storage_attestation","attest_author":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/action/author_attestation","sign_citation":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/action/citation_signature","submit_replication":"https://pith.science/pith/7L2LCIDIL22PSJFQG4OCMYKWBU/action/replication_record"}},"created_at":"2026-05-17T23:38:53.259493+00:00","updated_at":"2026-05-17T23:38:53.259493+00:00"}