{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:J2YFXZY33UMKM3KW4LC76TAFPL","short_pith_number":"pith:J2YFXZY3","canonical_record":{"source":{"id":"2604.16503","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-14T15:09:39Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6e8b79a904223c58498d3f57c7a29bfacc6c6429f415b10f39c0be4dac8db7b7","abstract_canon_sha256":"69298a6784daed110a9554e6e6b78272b8737d3308231f726b583112a9e3d994"},"schema_version":"1.0"},"canonical_sha256":"4eb05be71bdd18a66d56e2c5ff4c057ae5d5ebce652b2fd009f2812f8bdce389","source":{"kind":"arxiv","id":"2604.16503","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.16503","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"arxiv_version","alias_value":"2604.16503v2","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.16503","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"pith_short_12","alias_value":"J2YFXZY33UMK","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"pith_short_16","alias_value":"J2YFXZY33UMKM3KW","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"pith_short_8","alias_value":"J2YFXZY3","created_at":"2026-05-20T01:05:13Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:J2YFXZY33UMKM3KW4LC76TAFPL","target":"record","payload":{"canonical_record":{"source":{"id":"2604.16503","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-14T15:09:39Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6e8b79a904223c58498d3f57c7a29bfacc6c6429f415b10f39c0be4dac8db7b7","abstract_canon_sha256":"69298a6784daed110a9554e6e6b78272b8737d3308231f726b583112a9e3d994"},"schema_version":"1.0"},"canonical_sha256":"4eb05be71bdd18a66d56e2c5ff4c057ae5d5ebce652b2fd009f2812f8bdce389","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T01:05:13.744273Z","signature_b64":"7eIoNp5aeKvZr/GSYBRtbx5ZP2huQ6wUN12fEkT04JUnPH9bb0ALOnsrCC1aqTzeYcIzQ8cf1VmBL5uCZj/1Ag==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4eb05be71bdd18a66d56e2c5ff4c057ae5d5ebce652b2fd009f2812f8bdce389","last_reissued_at":"2026-05-20T01:05:13.743500Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T01:05:13.743500Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2604.16503","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tWZvLeA2E7z5dhzGyiMuGKR12BXcNC2gGk4dwLBEa2fUYx+62L8mO6xoYPmK5HcC/TsyyL7jassal0fgqMrVAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-03T01:32:21.601593Z"},"content_sha256":"b71f500711578f35f0965455c478e9224c19447a76010163daa8a4a9658703fc","schema_version":"1.0","event_id":"sha256:b71f500711578f35f0965455c478e9224c19447a76010163daa8a4a9658703fc"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:J2YFXZY33UMKM3KW4LC76TAFPL","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Motif-Video 2B: Technical Report","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Separating prompt alignment, temporal consistency, and fine-detail recovery into distinct stages lets a 2B video model outperform a 14B baseline on VBench with far less data and compute.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Beomgyu Kim, Bokki Ryu, Changjin Kang, Dahye Choi, Dongjoo Weon, Dongpin Oh, Dongseok Kim, Eunhwan Park, Haesol Lee, Hanbin Jung, Hongjoo Lee, Hyeyeon Cho, Hyukjin Kweon, Jaeheui Her, Jaeyeon Huh, Jangwoong Kim, Jeesoo Lee, Jeongdoo Lee, Junghwan Lim, Junhyeok Lee, Minjae Kim, Minsu Ha, Sungmin Lee, Taehyun Kim, Taewhan Kim, Wai Ting Cheung, Yeongjae Park, Youngrok Kim","submitted_at":"2026-04-14T15:09:39Z","abstract_excerpt":"Training strong video generation models usually requires massive datasets, large parameter counts, and substantial compute. In this work, we ask whether strong text-to-video quality is possible at a much smaller budget: fewer than 10M clips and less than 100,000 H200 GPU hours. Our core claim is that part of the answer lies in how model capacity is organized, not only in how much of it is used. In video generation, prompt alignment, temporal consistency, and fine-detail recovery can interfere with one another when they are handled through the same pathway. Motif-Video 2B addresses this by sepa"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"On VBench, Motif-Video~2B reaches 83.76%, surpassing Wan2.1 14B while using 7× fewer parameters and substantially less training data.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That separating prompt alignment, temporal consistency, and fine-detail recovery into distinct architectural stages actually reduces interference and drives the observed gains, rather than the gains arising mainly from the dynamic token routing, early feature alignment, or other unstated aspects of the training recipe.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Motif-Video 2B achieves 83.76% VBench score, beating a 14B-parameter baseline with 7x fewer parameters and substantially less training data through shared cross-attention and a three-part backbone.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Separating prompt alignment, temporal consistency, and fine-detail recovery into distinct stages lets a 2B video model outperform a 14B baseline on VBench with far less data and compute.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"eb9fab6ef52412df059910ebdd49605289c60590d1387d33e7d4b0bdbb7d533e"},"source":{"id":"2604.16503","kind":"arxiv","version":2},"verdict":{"id":"f5e34d16-5585-422d-a337-c37c353cf543","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-10T15:46:21.654550Z","strongest_claim":"On VBench, Motif-Video~2B reaches 83.76%, surpassing Wan2.1 14B while using 7× fewer parameters and substantially less training data.","one_line_summary":"Motif-Video 2B achieves 83.76% VBench score, beating a 14B-parameter baseline with 7x fewer parameters and substantially less training data through shared cross-attention and a three-part backbone.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That separating prompt alignment, temporal consistency, and fine-detail recovery into distinct architectural stages actually reduces interference and drives the observed gains, rather than the gains arising mainly from the dynamic token routing, early feature alignment, or other unstated aspects of the training recipe.","pith_extraction_headline":"Separating prompt alignment, temporal consistency, and fine-detail recovery into distinct stages lets a 2B video model outperform a 14B baseline on VBench with far less data and compute."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.16503/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"f5e34d16-5585-422d-a337-c37c353cf543"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T01:05:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"jrgeiQvD0d2E4Pp2H9TmZya+Wox5x8n89GOh3LOogMi5vsvc7SuXwnnCX/dxTU9iDbQ+WkHCjcngA5HqDmFODg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-03T01:32:21.602089Z"},"content_sha256":"eb1d759433b9a91cb020e29306cad77c02c4df8cd2690e52148d0809ecd66cad","schema_version":"1.0","event_id":"sha256:eb1d759433b9a91cb020e29306cad77c02c4df8cd2690e52148d0809ecd66cad"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/J2YFXZY33UMKM3KW4LC76TAFPL/bundle.json","state_url":"https://pith.science/pith/J2YFXZY33UMKM3KW4LC76TAFPL/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/J2YFXZY33UMKM3KW4LC76TAFPL/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-03T01:32:21Z","links":{"resolver":"https://pith.science/pith/J2YFXZY33UMKM3KW4LC76TAFPL","bundle":"https://pith.science/pith/J2YFXZY33UMKM3KW4LC76TAFPL/bundle.json","state":"https://pith.science/pith/J2YFXZY33UMKM3KW4LC76TAFPL/state.json","well_known_bundle":"https://pith.science/.well-known/pith/J2YFXZY33UMKM3KW4LC76TAFPL/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:J2YFXZY33UMKM3KW4LC76TAFPL","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"69298a6784daed110a9554e6e6b78272b8737d3308231f726b583112a9e3d994","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-14T15:09:39Z","title_canon_sha256":"6e8b79a904223c58498d3f57c7a29bfacc6c6429f415b10f39c0be4dac8db7b7"},"schema_version":"1.0","source":{"id":"2604.16503","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.16503","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"arxiv_version","alias_value":"2604.16503v2","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.16503","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"pith_short_12","alias_value":"J2YFXZY33UMK","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"pith_short_16","alias_value":"J2YFXZY33UMKM3KW","created_at":"2026-05-20T01:05:13Z"},{"alias_kind":"pith_short_8","alias_value":"J2YFXZY3","created_at":"2026-05-20T01:05:13Z"}],"graph_snapshots":[{"event_id":"sha256:eb1d759433b9a91cb020e29306cad77c02c4df8cd2690e52148d0809ecd66cad","target":"graph","created_at":"2026-05-20T01:05:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"On VBench, Motif-Video~2B reaches 83.76%, surpassing Wan2.1 14B while using 7× fewer parameters and substantially less training data."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That separating prompt alignment, temporal consistency, and fine-detail recovery into distinct architectural stages actually reduces interference and drives the observed gains, rather than the gains arising mainly from the dynamic token routing, early feature alignment, or other unstated aspects of the training recipe."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Motif-Video 2B achieves 83.76% VBench score, beating a 14B-parameter baseline with 7x fewer parameters and substantially less training data through shared cross-attention and a three-part backbone."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Separating prompt alignment, temporal consistency, and fine-detail recovery into distinct stages lets a 2B video model outperform a 14B baseline on VBench with far less data and compute."}],"snapshot_sha256":"eb9fab6ef52412df059910ebdd49605289c60590d1387d33e7d4b0bdbb7d533e"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2604.16503/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Training strong video generation models usually requires massive datasets, large parameter counts, and substantial compute. In this work, we ask whether strong text-to-video quality is possible at a much smaller budget: fewer than 10M clips and less than 100,000 H200 GPU hours. Our core claim is that part of the answer lies in how model capacity is organized, not only in how much of it is used. In video generation, prompt alignment, temporal consistency, and fine-detail recovery can interfere with one another when they are handled through the same pathway. Motif-Video 2B addresses this by sepa","authors_text":"Beomgyu Kim, Bokki Ryu, Changjin Kang, Dahye Choi, Dongjoo Weon, Dongpin Oh, Dongseok Kim, Eunhwan Park, Haesol Lee, Hanbin Jung, Hongjoo Lee, Hyeyeon Cho, Hyukjin Kweon, Jaeheui Her, Jaeyeon Huh, Jangwoong Kim, Jeesoo Lee, Jeongdoo Lee, Junghwan Lim, Junhyeok Lee, Minjae Kim, Minsu Ha, Sungmin Lee, Taehyun Kim, Taewhan Kim, Wai Ting Cheung, Yeongjae Park, Youngrok Kim","cross_cats":["cs.AI"],"headline":"Separating prompt alignment, temporal consistency, and fine-detail recovery into distinct stages lets a 2B video model outperform a 14B baseline on VBench with far less data and compute.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-14T15:09:39Z","title":"Motif-Video 2B: Technical Report"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.16503","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-10T15:46:21.654550Z","id":"f5e34d16-5585-422d-a337-c37c353cf543","model_set":{"reader":"grok-4.3"},"one_line_summary":"Motif-Video 2B achieves 83.76% VBench score, beating a 14B-parameter baseline with 7x fewer parameters and substantially less training data through shared cross-attention and a three-part backbone.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Separating prompt alignment, temporal consistency, and fine-detail recovery into distinct stages lets a 2B video model outperform a 14B baseline on VBench with far less data and compute.","strongest_claim":"On VBench, Motif-Video~2B reaches 83.76%, surpassing Wan2.1 14B while using 7× fewer parameters and substantially less training data.","weakest_assumption":"That separating prompt alignment, temporal consistency, and fine-detail recovery into distinct architectural stages actually reduces interference and drives the observed gains, rather than the gains arising mainly from the dynamic token routing, early feature alignment, or other unstated aspects of the training recipe."}},"verdict_id":"f5e34d16-5585-422d-a337-c37c353cf543"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b71f500711578f35f0965455c478e9224c19447a76010163daa8a4a9658703fc","target":"record","created_at":"2026-05-20T01:05:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"69298a6784daed110a9554e6e6b78272b8737d3308231f726b583112a9e3d994","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-04-14T15:09:39Z","title_canon_sha256":"6e8b79a904223c58498d3f57c7a29bfacc6c6429f415b10f39c0be4dac8db7b7"},"schema_version":"1.0","source":{"id":"2604.16503","kind":"arxiv","version":2}},"canonical_sha256":"4eb05be71bdd18a66d56e2c5ff4c057ae5d5ebce652b2fd009f2812f8bdce389","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4eb05be71bdd18a66d56e2c5ff4c057ae5d5ebce652b2fd009f2812f8bdce389","first_computed_at":"2026-05-20T01:05:13.743500Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T01:05:13.743500Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"7eIoNp5aeKvZr/GSYBRtbx5ZP2huQ6wUN12fEkT04JUnPH9bb0ALOnsrCC1aqTzeYcIzQ8cf1VmBL5uCZj/1Ag==","signature_status":"signed_v1","signed_at":"2026-05-20T01:05:13.744273Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.16503","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b71f500711578f35f0965455c478e9224c19447a76010163daa8a4a9658703fc","sha256:eb1d759433b9a91cb020e29306cad77c02c4df8cd2690e52148d0809ecd66cad"],"state_sha256":"b33618b6e6d8810df3435028bcda6ef08152c2b8c91962e05e886fac33c5635c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Q5el2yPpX+58cC5RjPZOowvPrTUEWShkSjrm4faQLqXVYKPgMXlGGvj5XnCvOfYxYkjJ5Jki00fyp9X/rWjxBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-03T01:32:21.604475Z","bundle_sha256":"38057d688dac9fdef67ded9d6cc9f464b3c9088f9962a710e592119f55a96d22"}}