{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:XJXDZDJIMCDXDCGGBCICCL6MRN","short_pith_number":"pith:XJXDZDJI","schema_version":"1.0","canonical_sha256":"ba6e3c8d2860877188c60890212fcc8b4e61374b1fc8caa39ba16ed65681c526","source":{"kind":"arxiv","id":"2605.16903","version":1},"attestation_state":"computed","paper":{"title":"WOW-Seg: A Word-free Open World Segmentation Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A word-free model segments and recognizes open-world objects by aligning visual masks directly to vision-language features.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bin Li, Danyang Li, Ming-Ming Cheng, Tianhao Wu, Xiang Li, Yang Zhang, Yuxuan Li, Zhenyuan Chen","submitted_at":"2026-05-16T09:28:46Z","abstract_excerpt":"Open world image segmentation aims to achieve precise segmentation and semantic understanding of targets within images by addressing the infinitely open set of object categories encountered in the real world. However, traditional closed-set segmentation approaches struggle to adapt to complex open world scenarios, while foundation segmentation models such as SAM exhibit notable discrepancies between their strong segmentation capabilities and relatively weaker semantic understanding. To bridge these discrepancies, we propose WOW-Seg, a Word-free Open World Segmentation model for segmenting and "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2605.16903","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-16T09:28:46Z","cross_cats_sorted":[],"title_canon_sha256":"c8d205046de8d48040a3f1624b4e2005008cc3a1fbd429f0f8c489012a69e057","abstract_canon_sha256":"9de3ce4401b6ea8db64491f6687e85c7bc99cf9973d46c58cf02214ca9fdc1dc"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:03:29.321152Z","signature_b64":"ZqNZMsMg+htAA0q62n7nsOYLkjCsmscdopKKbJ6Fi8l/heAZlMXblMm4qc3R3LhPMS3BxOEM8djT9YUxUTMGBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ba6e3c8d2860877188c60890212fcc8b4e61374b1fc8caa39ba16ed65681c526","last_reissued_at":"2026-05-20T00:03:29.320227Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:03:29.320227Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"WOW-Seg: A Word-free Open World Segmentation Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A word-free model segments and recognizes open-world objects by aligning visual masks directly to vision-language features.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bin Li, Danyang Li, Ming-Ming Cheng, Tianhao Wu, Xiang Li, Yang Zhang, Yuxuan Li, Zhenyuan Chen","submitted_at":"2026-05-16T09:28:46Z","abstract_excerpt":"Open world image segmentation aims to achieve precise segmentation and semantic understanding of targets within images by addressing the infinitely open set of object categories encountered in the real world. However, traditional closed-set segmentation approaches struggle to adapt to complex open world scenarios, while foundation segmentation models such as SAM exhibit notable discrepancies between their strong segmentation capabilities and relatively weaker semantic understanding. To bridge these discrepancies, we propose WOW-Seg, a Word-free Open World Segmentation model for segmenting and "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"WOW-Seg attains strong results on the LVIS dataset, achieving a semantic similarity of 89.7 and a semantic IoU of 82.4. This performance surpasses the previous SOTA while using only one-eighth the parameter count.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The Mask2Token module successfully aligns visual mask tokens with the VLLM feature space in a way that supports open-set recognition without any text supervision or category-specific training data.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"WOW-Seg proposes a word-free open-world segmentation model using Mask2Token and Cascade Attention Mask modules, reporting 89.7 semantic similarity and 82.4 semantic IoU on LVIS with one-eighth the parameters of prior SOTA plus a new 7,662-class benchmark.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A word-free model segments and recognizes open-world objects by aligning visual masks directly to vision-language features.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3d72a1fcc01d36e62bf40d8eef1f6b252d61d13914c806bc0ff8a03eb7634411"},"source":{"id":"2605.16903","kind":"arxiv","version":1},"verdict":{"id":"69f48347-3017-4d97-9f30-9da28a52fe10","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T21:20:00.129681Z","strongest_claim":"WOW-Seg attains strong results on the LVIS dataset, achieving a semantic similarity of 89.7 and a semantic IoU of 82.4. This performance surpasses the previous SOTA while using only one-eighth the parameter count.","one_line_summary":"WOW-Seg proposes a word-free open-world segmentation model using Mask2Token and Cascade Attention Mask modules, reporting 89.7 semantic similarity and 82.4 semantic IoU on LVIS with one-eighth the parameters of prior SOTA plus a new 7,662-class benchmark.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The Mask2Token module successfully aligns visual mask tokens with the VLLM feature space in a way that supports open-set recognition without any text supervision or category-specific training data.","pith_extraction_headline":"A word-free model segments and recognizes open-world objects by aligning visual masks directly to vision-language features."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.16903/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T21:31:19.175798Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T21:30:53.083186Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"cited_work_retraction","ran_at":"2026-05-19T20:52:22.531980Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T18:41:56.275065Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T18:33:26.354272Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"e4c04c460bf17d16d2d5a89a72c057cc967d17284cc674f3ac4f9cbdf1674eab"},"references":{"count":32,"sample":[{"doi":"","year":null,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":1,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":null,"title":"Shikra: Unleashing Multimodal LLM's Referential Dialogue Magic","work_id":"44525076-312a-4259-b79c-134cd7eeb297","ref_index":2,"cited_arxiv_id":"2306.15195","is_internal_anchor":true},{"doi":"","year":null,"title":"Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling","work_id":"ee70bdc8-4656-4849-ada7-ce42a2278d70","ref_index":3,"cited_arxiv_id":"2412.05271","is_internal_anchor":true},{"doi":"","year":2026,"title":"Imagenet: A large-scale hi- erarchical image database","work_id":"0fb101e8-edf8-42ad-84e9-6617d6f6e1db","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Tag: Guidance-free open-vocabulary semantic segmenta- tion.arXiv preprint arXiv:2403.11197,","work_id":"c78f83c4-c2ad-4090-b9ec-ed9b31fbdb24","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":32,"snapshot_sha256":"90facedebaa982d24d78e963f65997547cd860c80d53d4123a90665fc7e39d79","internal_anchors":12},"formal_canon":{"evidence_count":2,"snapshot_sha256":"58a31548d4609412e852fbb710521e18da8a1c8af88bd3d0a96cbdd9f0098be2"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.16903","created_at":"2026-05-20T00:03:29.320401+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.16903v1","created_at":"2026-05-20T00:03:29.320401+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16903","created_at":"2026-05-20T00:03:29.320401+00:00"},{"alias_kind":"pith_short_12","alias_value":"XJXDZDJIMCDX","created_at":"2026-05-20T00:03:29.320401+00:00"},{"alias_kind":"pith_short_16","alias_value":"XJXDZDJIMCDXDCGG","created_at":"2026-05-20T00:03:29.320401+00:00"},{"alias_kind":"pith_short_8","alias_value":"XJXDZDJI","created_at":"2026-05-20T00:03:29.320401+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN","json":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN.json","graph_json":"https://pith.science/api/pith-number/XJXDZDJIMCDXDCGGBCICCL6MRN/graph.json","events_json":"https://pith.science/api/pith-number/XJXDZDJIMCDXDCGGBCICCL6MRN/events.json","paper":"https://pith.science/paper/XJXDZDJI"},"agent_actions":{"view_html":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN","download_json":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN.json","view_paper":"https://pith.science/paper/XJXDZDJI","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.16903&json=true","fetch_graph":"https://pith.science/api/pith-number/XJXDZDJIMCDXDCGGBCICCL6MRN/graph.json","fetch_events":"https://pith.science/api/pith-number/XJXDZDJIMCDXDCGGBCICCL6MRN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN/action/storage_attestation","attest_author":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN/action/author_attestation","sign_citation":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN/action/citation_signature","submit_replication":"https://pith.science/pith/XJXDZDJIMCDXDCGGBCICCL6MRN/action/replication_record"}},"created_at":"2026-05-20T00:03:29.320401+00:00","updated_at":"2026-05-20T00:03:29.320401+00:00"}