{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:3PFFSBOTJ4WIK6L4A573HEVZ4U","short_pith_number":"pith:3PFFSBOT","canonical_record":{"source":{"id":"2605.12953","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:36:44Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"1d638688c214b9c910920670d63eb82ade729e2881007a82a725e6b85d45e068","abstract_canon_sha256":"c6de3220d38c5bb0bb238dab329644b42a90c09b9802db94b99b69a3e9d5c74b"},"schema_version":"1.0"},"canonical_sha256":"dbca5905d34f2c85797c077fb392b9e50a8706ed252b6fb9ae609463ec214b1f","source":{"kind":"arxiv","id":"2605.12953","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12953","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12953v1","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12953","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"pith_short_12","alias_value":"3PFFSBOTJ4WI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"3PFFSBOTJ4WIK6L4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"3PFFSBOT","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:3PFFSBOTJ4WIK6L4A573HEVZ4U","target":"record","payload":{"canonical_record":{"source":{"id":"2605.12953","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:36:44Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"1d638688c214b9c910920670d63eb82ade729e2881007a82a725e6b85d45e068","abstract_canon_sha256":"c6de3220d38c5bb0bb238dab329644b42a90c09b9802db94b99b69a3e9d5c74b"},"schema_version":"1.0"},"canonical_sha256":"dbca5905d34f2c85797c077fb392b9e50a8706ed252b6fb9ae609463ec214b1f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:09.360203Z","signature_b64":"oFqpurGJMzLKy4QvXn22wpq/EO8onozRsz4dFsQbvkjAPPvon40AOj56xCNRmKEH1JPtTqicZQOzQ1HHfQqHBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dbca5905d34f2c85797c077fb392b9e50a8706ed252b6fb9ae609463ec214b1f","last_reissued_at":"2026-05-18T03:09:09.359460Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:09.359460Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.12953","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:09Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"7yehQipnoaRzenuCjr1cWLaVCVYqrQb+eYWwxJM2/TFPlVusKglblZHWF79Wwb/0x4UH6FQh1V7iw+BFOH9+BA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T00:38:28.911104Z"},"content_sha256":"d1d3f648eee89386ab7e199f84168aa2aa922848555a57cdca932204f8d685aa","schema_version":"1.0","event_id":"sha256:d1d3f648eee89386ab7e199f84168aa2aa922848555a57cdca932204f8d685aa"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:3PFFSBOTJ4WIK6L4A573HEVZ4U","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Seg-Agent: Test-Time Multimodal Reasoning for Training-Free Language-Guided Segmentation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Seg-Agent lets off-the-shelf multimodal LLMs segment images from language instructions by running an iterative visual reasoning loop over marked regions on the image itself.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Chao Hao, Guangcong Wang, Ji Du, Jun Xu, Shuo Ye, Xiaodong Cun, Xubin Zheng, Zitong Yu, Ziyue Qiao","submitted_at":"2026-05-13T03:36:44Z","abstract_excerpt":"Language-guided segmentation transcends the scope limitations of traditional semantic segmentation, enabling models to segment arbitrary target regions based on natural language instructions. Existing approaches typically adopt a two-stage framework: employing Multimodal Large Language Models (MLLMs) to interpret instructions and generate visual prompts, followed by foundational segmentation models (e.g., SAM) to produce masks. However, due to the limited spatial grounding capabilities of off-the-shelf MLLMs, these methods often rely on extensive training on large-scale datasets to achieve sat"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"This explicit multimodal interaction enables Seg-Agent to achieve performance comparable to state-of-the-art training-based methods without any parameter updates.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That off-the-shelf MLLMs, when given Set-of-Mark visual prompts, can reliably perform spatial selection and refinement in the visual domain without any fine-tuning or additional training data.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Seg-Agent performs language-guided segmentation without training by using Set-of-Mark visual prompts to enable explicit multimodal chain-of-reasoning in three stages: generation, selection, and refinement.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Seg-Agent lets off-the-shelf multimodal LLMs segment images from language instructions by running an iterative visual reasoning loop over marked regions on the image itself.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e43d6d4f01a0a21b1f955e29050a4ebec30c9868d6c0e61787597a45decb864c"},"source":{"id":"2605.12953","kind":"arxiv","version":1},"verdict":{"id":"20bb19b1-02b7-4844-b64c-50254926af17","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T19:44:19.960359Z","strongest_claim":"This explicit multimodal interaction enables Seg-Agent to achieve performance comparable to state-of-the-art training-based methods without any parameter updates.","one_line_summary":"Seg-Agent performs language-guided segmentation without training by using Set-of-Mark visual prompts to enable explicit multimodal chain-of-reasoning in three stages: generation, selection, and refinement.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That off-the-shelf MLLMs, when given Set-of-Mark visual prompts, can reliably perform spatial selection and refinement in the visual domain without any fine-tuning or additional training data.","pith_extraction_headline":"Seg-Agent lets off-the-shelf multimodal LLMs segment images from language instructions by running an iterative visual reasoning loop over marked regions on the image itself."},"references":{"count":53,"sample":[{"doi":"","year":2023,"title":"Visual instruction tuning.Advances in neural infor- mation processing systems, 36:34892–34916, 2023","work_id":"4d50f42f-b0f0-43d8-9d98-2017bc6af231","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","ref_index":2,"cited_arxiv_id":"2303.08774","is_internal_anchor":true},{"doi":"","year":2025,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":3,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":2021,"title":"Schwing, and Alexander Kirillov","work_id":"623c3651-4a96-423a-878c-50f851fbf767","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Schwing, Alexander Kirillov, and Rohit Girdhar","work_id":"1ff86880-a984-46c2-9df9-6ca32a66dc1a","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":53,"snapshot_sha256":"d33af0c2cb3c53aa053fdd202d6902e35438b1b62faca8024004ecc45f9f7875","internal_anchors":10},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"20bb19b1-02b7-4844-b64c-50254926af17"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:09Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"z1CU2qUvo1rwgfI6UfjQOyWqCwJ581qy8TiKrjqCRX4oqBhbroh+RD9nRP5UQmKCtwY4HLJM/ClysjA2OrkPAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T00:38:28.912249Z"},"content_sha256":"ab69070ffe65175de4521b2b1be546103cab49999a935f795581362f08551b66","schema_version":"1.0","event_id":"sha256:ab69070ffe65175de4521b2b1be546103cab49999a935f795581362f08551b66"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/3PFFSBOTJ4WIK6L4A573HEVZ4U/bundle.json","state_url":"https://pith.science/pith/3PFFSBOTJ4WIK6L4A573HEVZ4U/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/3PFFSBOTJ4WIK6L4A573HEVZ4U/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T00:38:28Z","links":{"resolver":"https://pith.science/pith/3PFFSBOTJ4WIK6L4A573HEVZ4U","bundle":"https://pith.science/pith/3PFFSBOTJ4WIK6L4A573HEVZ4U/bundle.json","state":"https://pith.science/pith/3PFFSBOTJ4WIK6L4A573HEVZ4U/state.json","well_known_bundle":"https://pith.science/.well-known/pith/3PFFSBOTJ4WIK6L4A573HEVZ4U/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:3PFFSBOTJ4WIK6L4A573HEVZ4U","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"c6de3220d38c5bb0bb238dab329644b42a90c09b9802db94b99b69a3e9d5c74b","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:36:44Z","title_canon_sha256":"1d638688c214b9c910920670d63eb82ade729e2881007a82a725e6b85d45e068"},"schema_version":"1.0","source":{"id":"2605.12953","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12953","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12953v1","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12953","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"pith_short_12","alias_value":"3PFFSBOTJ4WI","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"3PFFSBOTJ4WIK6L4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"3PFFSBOT","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:ab69070ffe65175de4521b2b1be546103cab49999a935f795581362f08551b66","target":"graph","created_at":"2026-05-18T03:09:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"This explicit multimodal interaction enables Seg-Agent to achieve performance comparable to state-of-the-art training-based methods without any parameter updates."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That off-the-shelf MLLMs, when given Set-of-Mark visual prompts, can reliably perform spatial selection and refinement in the visual domain without any fine-tuning or additional training data."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Seg-Agent performs language-guided segmentation without training by using Set-of-Mark visual prompts to enable explicit multimodal chain-of-reasoning in three stages: generation, selection, and refinement."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Seg-Agent lets off-the-shelf multimodal LLMs segment images from language instructions by running an iterative visual reasoning loop over marked regions on the image itself."}],"snapshot_sha256":"e43d6d4f01a0a21b1f955e29050a4ebec30c9868d6c0e61787597a45decb864c"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Language-guided segmentation transcends the scope limitations of traditional semantic segmentation, enabling models to segment arbitrary target regions based on natural language instructions. Existing approaches typically adopt a two-stage framework: employing Multimodal Large Language Models (MLLMs) to interpret instructions and generate visual prompts, followed by foundational segmentation models (e.g., SAM) to produce masks. However, due to the limited spatial grounding capabilities of off-the-shelf MLLMs, these methods often rely on extensive training on large-scale datasets to achieve sat","authors_text":"Chao Hao, Guangcong Wang, Ji Du, Jun Xu, Shuo Ye, Xiaodong Cun, Xubin Zheng, Zitong Yu, Ziyue Qiao","cross_cats":["cs.AI"],"headline":"Seg-Agent lets off-the-shelf multimodal LLMs segment images from language instructions by running an iterative visual reasoning loop over marked regions on the image itself.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:36:44Z","title":"Seg-Agent: Test-Time Multimodal Reasoning for Training-Free Language-Guided Segmentation"},"references":{"count":53,"internal_anchors":10,"resolved_work":53,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Visual instruction tuning.Advances in neural infor- mation processing systems, 36:34892–34916, 2023","work_id":"4d50f42f-b0f0-43d8-9d98-2017bc6af231","year":2023},{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":2,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":2023},{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Schwing, and Alexander Kirillov","work_id":"623c3651-4a96-423a-878c-50f851fbf767","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Schwing, Alexander Kirillov, and Rohit Girdhar","work_id":"1ff86880-a984-46c2-9df9-6ca32a66dc1a","year":2022}],"snapshot_sha256":"d33af0c2cb3c53aa053fdd202d6902e35438b1b62faca8024004ecc45f9f7875"},"source":{"id":"2605.12953","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T19:44:19.960359Z","id":"20bb19b1-02b7-4844-b64c-50254926af17","model_set":{"reader":"grok-4.3"},"one_line_summary":"Seg-Agent performs language-guided segmentation without training by using Set-of-Mark visual prompts to enable explicit multimodal chain-of-reasoning in three stages: generation, selection, and refinement.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Seg-Agent lets off-the-shelf multimodal LLMs segment images from language instructions by running an iterative visual reasoning loop over marked regions on the image itself.","strongest_claim":"This explicit multimodal interaction enables Seg-Agent to achieve performance comparable to state-of-the-art training-based methods without any parameter updates.","weakest_assumption":"That off-the-shelf MLLMs, when given Set-of-Mark visual prompts, can reliably perform spatial selection and refinement in the visual domain without any fine-tuning or additional training data."}},"verdict_id":"20bb19b1-02b7-4844-b64c-50254926af17"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:d1d3f648eee89386ab7e199f84168aa2aa922848555a57cdca932204f8d685aa","target":"record","created_at":"2026-05-18T03:09:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"c6de3220d38c5bb0bb238dab329644b42a90c09b9802db94b99b69a3e9d5c74b","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:36:44Z","title_canon_sha256":"1d638688c214b9c910920670d63eb82ade729e2881007a82a725e6b85d45e068"},"schema_version":"1.0","source":{"id":"2605.12953","kind":"arxiv","version":1}},"canonical_sha256":"dbca5905d34f2c85797c077fb392b9e50a8706ed252b6fb9ae609463ec214b1f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"dbca5905d34f2c85797c077fb392b9e50a8706ed252b6fb9ae609463ec214b1f","first_computed_at":"2026-05-18T03:09:09.359460Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:09.359460Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"oFqpurGJMzLKy4QvXn22wpq/EO8onozRsz4dFsQbvkjAPPvon40AOj56xCNRmKEH1JPtTqicZQOzQ1HHfQqHBA==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:09.360203Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.12953","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:d1d3f648eee89386ab7e199f84168aa2aa922848555a57cdca932204f8d685aa","sha256:ab69070ffe65175de4521b2b1be546103cab49999a935f795581362f08551b66"],"state_sha256":"b6fdb42a51c164a8cdffd9fd103480c2ac16cfc103cf91986eeb7b9863cd34ea"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"G1jMnlb9NKQ19ZdCDHq5rMzsHTdwaLGbGB0qG0LJwNoEd3G0X0HpFqiJXXQZgvtU4ZnbU6X02/dvPCDaJzyuAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T00:38:28.916890Z","bundle_sha256":"edac47fd44057dd686364ecd0465c3000baaab3735a331afe67b880dc697abea"}}