{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:7RWQBRQ5PEDT377TLKLPZVPJ5K","short_pith_number":"pith:7RWQBRQ5","schema_version":"1.0","canonical_sha256":"fc6d00c61d79073dfff35a96fcd5e9ea89b24a4c5870cc4e2653680249e5b583","source":{"kind":"arxiv","id":"1809.03327","version":1},"attestation_state":"computed","paper":{"title":"YouTube-VOS: A Large-Scale Video Object Segmentation Benchmark","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Dingcheng Yue, Jianchao Yang, Linjie Yang, Ning Xu, Thomas Huang, Yuchen Fan, Yuchen Liang","submitted_at":"2018-09-06T04:19:45Z","abstract_excerpt":"Learning long-term spatial-temporal features are critical for many video analysis tasks. However, existing video segmentation methods predominantly rely on static image segmentation techniques, and methods capturing temporal dependency for segmentation have to depend on pretrained optical flow models, leading to suboptimal solutions for the problem. End-to-end sequential learning to explore spatialtemporal features for video segmentation is largely limited by the scale of available video segmentation datasets, i.e., even the largest video segmentation dataset only contains 90 short video clips"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1809.03327","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2018-09-06T04:19:45Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"228744cc4d6d727481f09c578da60c8231858ff051b7a6c6c3500fa07499de28","abstract_canon_sha256":"441b1bfc4cf718271e2a64b92c21db716e766e1b8041318c035f40cacd1bd9d8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:06:07.183926Z","signature_b64":"Vlj5Xxor/HhfXzY5aSW8qoastt/xsvQvo04RlvoKhY9hiIw5ymh55dwOBQErfYx2vlT84Bf6zvCZDjQPJ/llAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fc6d00c61d79073dfff35a96fcd5e9ea89b24a4c5870cc4e2653680249e5b583","last_reissued_at":"2026-05-18T00:06:07.183465Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:06:07.183465Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"YouTube-VOS: A Large-Scale Video Object Segmentation Benchmark","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Dingcheng Yue, Jianchao Yang, Linjie Yang, Ning Xu, Thomas Huang, Yuchen Fan, Yuchen Liang","submitted_at":"2018-09-06T04:19:45Z","abstract_excerpt":"Learning long-term spatial-temporal features are critical for many video analysis tasks. However, existing video segmentation methods predominantly rely on static image segmentation techniques, and methods capturing temporal dependency for segmentation have to depend on pretrained optical flow models, leading to suboptimal solutions for the problem. End-to-end sequential learning to explore spatialtemporal features for video segmentation is largely limited by the scale of available video segmentation datasets, i.e., even the largest video segmentation dataset only contains 90 short video clips"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1809.03327","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1809.03327","created_at":"2026-05-18T00:06:07.183546+00:00"},{"alias_kind":"arxiv_version","alias_value":"1809.03327v1","created_at":"2026-05-18T00:06:07.183546+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1809.03327","created_at":"2026-05-18T00:06:07.183546+00:00"},{"alias_kind":"pith_short_12","alias_value":"7RWQBRQ5PEDT","created_at":"2026-05-18T12:32:11.075285+00:00"},{"alias_kind":"pith_short_16","alias_value":"7RWQBRQ5PEDT377T","created_at":"2026-05-18T12:32:11.075285+00:00"},{"alias_kind":"pith_short_8","alias_value":"7RWQBRQ5","created_at":"2026-05-18T12:32:11.075285+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":8,"sample":[{"citing_arxiv_id":"2504.12169","citing_title":"Towards a General-Purpose Zero-Shot Synthetic Low-Light Image and Video Pipeline","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2510.18822","citing_title":"SAM 2++: Tracking Anything at Any Granularity","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18010","citing_title":"Functionalization via Structure Completion and Motion Rectification","ref_index":281,"is_internal_anchor":true},{"citing_arxiv_id":"2506.05425","citing_title":"SIV-Bench: A Video Benchmark for Social Interaction Understanding and Reasoning","ref_index":54,"is_internal_anchor":true},{"citing_arxiv_id":"2511.16719","citing_title":"SAM 3: Segment Anything with Concepts","ref_index":145,"is_internal_anchor":true},{"citing_arxiv_id":"2512.13684","citing_title":"Recurrent Video Masked Autoencoders","ref_index":75,"is_internal_anchor":true},{"citing_arxiv_id":"2512.22046","citing_title":"Backdoor Attacks on Prompt-Driven Video Segmentation Foundation Models","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2601.08831","citing_title":"3AM: 3egment Anything with Geometric Consistency in Videos","ref_index":92,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12006","citing_title":"Robust Promptable Video Object Segmentation","ref_index":47,"is_internal_anchor":false},{"citing_arxiv_id":"2604.27322","citing_title":"YOSE: You Only Select Essential Tokens for Efficient DiT-based Video Object Removal","ref_index":23,"is_internal_anchor":false},{"citing_arxiv_id":"2604.26488","citing_title":"Featurising Pixels from Dynamic 3D Scenes with Linear In-Context Learners","ref_index":44,"is_internal_anchor":false},{"citing_arxiv_id":"2605.03276","citing_title":"VEBench:Benchmarking Large Multimodal Models for Real-World Video Editing","ref_index":38,"is_internal_anchor":false},{"citing_arxiv_id":"2605.03276","citing_title":"VEBench:Benchmarking Large Multimodal Models for Real-World Video Editing","ref_index":38,"is_internal_anchor":false},{"citing_arxiv_id":"2605.00891","citing_title":"X2SAM: Any Segmentation in Images and Videos","ref_index":42,"is_internal_anchor":false},{"citing_arxiv_id":"2604.14630","citing_title":"CMTM: Cross-Modal Token Modulation for Unsupervised Video Object Segmentation","ref_index":23,"is_internal_anchor":false},{"citing_arxiv_id":"2604.07901","citing_title":"PanoSAM2: Lightweight Distortion- and Memory-aware Adaptions of SAM2 for 360 Video Object Segmentation","ref_index":36,"is_internal_anchor":false},{"citing_arxiv_id":"2408.00714","citing_title":"SAM 2: Segment Anything in Images and Videos","ref_index":29,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K","json":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K.json","graph_json":"https://pith.science/api/pith-number/7RWQBRQ5PEDT377TLKLPZVPJ5K/graph.json","events_json":"https://pith.science/api/pith-number/7RWQBRQ5PEDT377TLKLPZVPJ5K/events.json","paper":"https://pith.science/paper/7RWQBRQ5"},"agent_actions":{"view_html":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K","download_json":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K.json","view_paper":"https://pith.science/paper/7RWQBRQ5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1809.03327&json=true","fetch_graph":"https://pith.science/api/pith-number/7RWQBRQ5PEDT377TLKLPZVPJ5K/graph.json","fetch_events":"https://pith.science/api/pith-number/7RWQBRQ5PEDT377TLKLPZVPJ5K/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K/action/storage_attestation","attest_author":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K/action/author_attestation","sign_citation":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K/action/citation_signature","submit_replication":"https://pith.science/pith/7RWQBRQ5PEDT377TLKLPZVPJ5K/action/replication_record"}},"created_at":"2026-05-18T00:06:07.183546+00:00","updated_at":"2026-05-18T00:06:07.183546+00:00"}