{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:3WUA2SMOA5IDJOY4KXU55G44FG","short_pith_number":"pith:3WUA2SMO","schema_version":"1.0","canonical_sha256":"dda80d498e075034bb1c55e9de9b9c29bf29f1e4ea51095f45f0a9c1c4e5e501","source":{"kind":"arxiv","id":"2606.24464","version":1},"attestation_state":"computed","paper":{"title":"Boosting Text-Driven Video Segmentation via Geometry-Aware Distillation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Hesong Li, Tianyu Zhu, Ying Fu, Yingping Liang","submitted_at":"2026-06-23T11:57:08Z","abstract_excerpt":"Text-driven Referring Video Object Segmentation (RVOS) aims to locate and segment target objects in videos given natural language. However, existing models are typically trained on 2D image or video datasets with naive segmentation losses, which overlooks the geometric consistency across frames and leads to weak spatial understanding. In this paper, we propose Geometry-enhanced Language-guided Video segmentation (GeoLaV), a two-stage framework that distills 3D geometric knowledge from images to enhance text-driven video segmentation. In the first stage, we perform monocular geometry pretrainin"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.24464","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-06-23T11:57:08Z","cross_cats_sorted":[],"title_canon_sha256":"68bc96f008b96a41e1e34b28d5f4808f43a634f8c4077f6716bff5a7852a2c0c","abstract_canon_sha256":"c9111315e0ed4b1352f496378b93ff1d15b190b12c222b54f8a38d2f73034224"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-24T01:15:31.095931Z","signature_b64":"ydabPmnXFXETd/mItvWBv2VPo1zJ2x8Iu1kgSoqctC3vGzRJDVqe/M9UVnIIanHT3YsCFwbgbtO6nnB20FTJCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dda80d498e075034bb1c55e9de9b9c29bf29f1e4ea51095f45f0a9c1c4e5e501","last_reissued_at":"2026-06-24T01:15:31.095568Z","signature_status":"signed_v1","first_computed_at":"2026-06-24T01:15:31.095568Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Boosting Text-Driven Video Segmentation via Geometry-Aware Distillation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Hesong Li, Tianyu Zhu, Ying Fu, Yingping Liang","submitted_at":"2026-06-23T11:57:08Z","abstract_excerpt":"Text-driven Referring Video Object Segmentation (RVOS) aims to locate and segment target objects in videos given natural language. However, existing models are typically trained on 2D image or video datasets with naive segmentation losses, which overlooks the geometric consistency across frames and leads to weak spatial understanding. In this paper, we propose Geometry-enhanced Language-guided Video segmentation (GeoLaV), a two-stage framework that distills 3D geometric knowledge from images to enhance text-driven video segmentation. In the first stage, we perform monocular geometry pretrainin"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.24464","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.24464/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.24464","created_at":"2026-06-24T01:15:31.095639+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.24464v1","created_at":"2026-06-24T01:15:31.095639+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.24464","created_at":"2026-06-24T01:15:31.095639+00:00"},{"alias_kind":"pith_short_12","alias_value":"3WUA2SMOA5ID","created_at":"2026-06-24T01:15:31.095639+00:00"},{"alias_kind":"pith_short_16","alias_value":"3WUA2SMOA5IDJOY4","created_at":"2026-06-24T01:15:31.095639+00:00"},{"alias_kind":"pith_short_8","alias_value":"3WUA2SMO","created_at":"2026-06-24T01:15:31.095639+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG","json":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG.json","graph_json":"https://pith.science/api/pith-number/3WUA2SMOA5IDJOY4KXU55G44FG/graph.json","events_json":"https://pith.science/api/pith-number/3WUA2SMOA5IDJOY4KXU55G44FG/events.json","paper":"https://pith.science/paper/3WUA2SMO"},"agent_actions":{"view_html":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG","download_json":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG.json","view_paper":"https://pith.science/paper/3WUA2SMO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.24464&json=true","fetch_graph":"https://pith.science/api/pith-number/3WUA2SMOA5IDJOY4KXU55G44FG/graph.json","fetch_events":"https://pith.science/api/pith-number/3WUA2SMOA5IDJOY4KXU55G44FG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG/action/storage_attestation","attest_author":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG/action/author_attestation","sign_citation":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG/action/citation_signature","submit_replication":"https://pith.science/pith/3WUA2SMOA5IDJOY4KXU55G44FG/action/replication_record"}},"created_at":"2026-06-24T01:15:31.095639+00:00","updated_at":"2026-06-24T01:15:31.095639+00:00"}