{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2014:LOAS77FFXHK6VG5R6CX7NP5VU2","short_pith_number":"pith:LOAS77FF","schema_version":"1.0","canonical_sha256":"5b812ffca5b9d5ea9bb1f0aff6bfb5a6a2f6fd1c8c49bcb908d4a164999cd514","source":{"kind":"arxiv","id":"1412.6604","version":5},"attestation_state":"computed","paper":{"title":"Video (language) modeling: a baseline for generative models of natural videos","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.LG","authors_text":"Arthur Szlam, Joan Bruna, MarcAurelio Ranzato, Michael Mathieu, Ronan Collobert, Sumit Chopra","submitted_at":"2014-12-20T05:05:51Z","abstract_excerpt":"We propose a strong baseline model for unsupervised feature learning using video data. By learning to predict missing frames or extrapolate future frames from an input video sequence, the model discovers both spatial and temporal correlations which are useful to represent complex deformations and motion patterns. The models we propose are largely borrowed from the language modeling literature, and adapted to the vision domain by quantizing the space of image patches into a large dictionary. We demonstrate the approach on both a filling and a generation task. For the first time, we show that, a"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1412.6604","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2014-12-20T05:05:51Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"cfdcf905a177d291430dde59fb4773751d532852acecd41bc6b8132d928f4f0a","abstract_canon_sha256":"bf416b97ed660c9f1629e3bbd4152299e433468e6a9a51d47bea4f06a11281a0"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T01:15:38.854290Z","signature_b64":"OPWeDuOZWBkoLH+h7Tmquk9LBmHQQBlZft6j/fy8mlkvFqalaaRotyV5m9hyFBECM/Z8tH/iQRwc+En7KCS8CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5b812ffca5b9d5ea9bb1f0aff6bfb5a6a2f6fd1c8c49bcb908d4a164999cd514","last_reissued_at":"2026-05-18T01:15:38.853666Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T01:15:38.853666Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Video (language) modeling: a baseline for generative models of natural videos","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.LG","authors_text":"Arthur Szlam, Joan Bruna, MarcAurelio Ranzato, Michael Mathieu, Ronan Collobert, Sumit Chopra","submitted_at":"2014-12-20T05:05:51Z","abstract_excerpt":"We propose a strong baseline model for unsupervised feature learning using video data. By learning to predict missing frames or extrapolate future frames from an input video sequence, the model discovers both spatial and temporal correlations which are useful to represent complex deformations and motion patterns. The models we propose are largely borrowed from the language modeling literature, and adapted to the vision domain by quantizing the space of image patches into a large dictionary. We demonstrate the approach on both a filling and a generation task. For the first time, we show that, a"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1412.6604","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1412.6604","created_at":"2026-05-18T01:15:38.853757+00:00"},{"alias_kind":"arxiv_version","alias_value":"1412.6604v5","created_at":"2026-05-18T01:15:38.853757+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1412.6604","created_at":"2026-05-18T01:15:38.853757+00:00"},{"alias_kind":"pith_short_12","alias_value":"LOAS77FFXHK6","created_at":"2026-05-18T12:28:38.356838+00:00"},{"alias_kind":"pith_short_16","alias_value":"LOAS77FFXHK6VG5R","created_at":"2026-05-18T12:28:38.356838+00:00"},{"alias_kind":"pith_short_8","alias_value":"LOAS77FF","created_at":"2026-05-18T12:28:38.356838+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":5,"sample":[{"citing_arxiv_id":"2304.11193","citing_title":"Multi-Modal World Model for Physical Robot Interactions: Simultaneous Visual and Tactile Predictions for Enhanced Accuracy","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2507.13942","citing_title":"Frozen Forecasting: A Unified Evaluation","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2210.02399","citing_title":"Phenaki: Variable Length Video Generation From Open Domain Textual Description","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2508.00795","citing_title":"Video Generators are Robot Policies","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2211.11018","citing_title":"MagicVideo: Efficient Video Generation With Latent Diffusion Models","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2210.02303","citing_title":"Imagen Video: High Definition Video Generation with Diffusion Models","ref_index":15,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2","json":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2.json","graph_json":"https://pith.science/api/pith-number/LOAS77FFXHK6VG5R6CX7NP5VU2/graph.json","events_json":"https://pith.science/api/pith-number/LOAS77FFXHK6VG5R6CX7NP5VU2/events.json","paper":"https://pith.science/paper/LOAS77FF"},"agent_actions":{"view_html":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2","download_json":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2.json","view_paper":"https://pith.science/paper/LOAS77FF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1412.6604&json=true","fetch_graph":"https://pith.science/api/pith-number/LOAS77FFXHK6VG5R6CX7NP5VU2/graph.json","fetch_events":"https://pith.science/api/pith-number/LOAS77FFXHK6VG5R6CX7NP5VU2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2/action/storage_attestation","attest_author":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2/action/author_attestation","sign_citation":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2/action/citation_signature","submit_replication":"https://pith.science/pith/LOAS77FFXHK6VG5R6CX7NP5VU2/action/replication_record"}},"created_at":"2026-05-18T01:15:38.853757+00:00","updated_at":"2026-05-18T01:15:38.853757+00:00"}