{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:IDC3K2XENZD5TP7DK7CGDHKNII","short_pith_number":"pith:IDC3K2XE","schema_version":"1.0","canonical_sha256":"40c5b56ae46e47d9bfe357c4619d4d422ccd48992c16893d1fe6e3172e1b6450","source":{"kind":"arxiv","id":"2504.04540","version":2},"attestation_state":"computed","paper":{"title":"The Point, the Vision and the Text: Does Point Cloud Boost Spatial Reasoning of Large Language Models? A Bias-Controlled Study","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Chen Gao, Heng Dong, Jianjie Fang, Kaiyuan Li, Ruiying Peng, Weichen Zhang, Wei Li, Xinlei Chen, Xin Wang, Xin Zeng, Yong Li, Ziyou Wang","submitted_at":"2025-04-06T16:38:48Z","abstract_excerpt":"3D Large Language Models (LLMs) leveraging spatial information in point clouds for 3D spatial reasoning attract great attention. Despite some promising results, the advantages of point clouds over other modalities remain unclear. Moreover, existing 3D benchmarks are insufficient for fairly evaluating the ability of multimodal LLMs to comprehend spatial concepts. To address these challenges, we introduce ScanReQA, a 3D spatial reasoning benchmark encompassing text, vision, and point cloud modalities. We then evaluate the performance of text, 2D, and 3D LLMs on the benchmark to compare the effec"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2504.04540","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-04-06T16:38:48Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c6763f280ab262e1b4861ba5aa45d786e3f6da1713499895742470ae9a02d6f7","abstract_canon_sha256":"3bd15c244d80a6a7e41ce2c585cf43d245cd8906261613ff52a1b559c1dffee8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T02:04:40.340775Z","signature_b64":"i3puqH0Wl0jnCWUfVxkuEdy+x4xM2cPny8e4ZfZSw619JatT9UUxpJ6jvT7I9MpwFZhrJCX/kiE/IzpWkATMAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"40c5b56ae46e47d9bfe357c4619d4d422ccd48992c16893d1fe6e3172e1b6450","last_reissued_at":"2026-05-28T02:04:40.340127Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T02:04:40.340127Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"The Point, the Vision and the Text: Does Point Cloud Boost Spatial Reasoning of Large Language Models? A Bias-Controlled Study","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Chen Gao, Heng Dong, Jianjie Fang, Kaiyuan Li, Ruiying Peng, Weichen Zhang, Wei Li, Xinlei Chen, Xin Wang, Xin Zeng, Yong Li, Ziyou Wang","submitted_at":"2025-04-06T16:38:48Z","abstract_excerpt":"3D Large Language Models (LLMs) leveraging spatial information in point clouds for 3D spatial reasoning attract great attention. Despite some promising results, the advantages of point clouds over other modalities remain unclear. Moreover, existing 3D benchmarks are insufficient for fairly evaluating the ability of multimodal LLMs to comprehend spatial concepts. To address these challenges, we introduce ScanReQA, a 3D spatial reasoning benchmark encompassing text, vision, and point cloud modalities. We then evaluate the performance of text, 2D, and 3D LLMs on the benchmark to compare the effec"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.04540","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2504.04540/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2504.04540","created_at":"2026-05-28T02:04:40.340218+00:00"},{"alias_kind":"arxiv_version","alias_value":"2504.04540v2","created_at":"2026-05-28T02:04:40.340218+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.04540","created_at":"2026-05-28T02:04:40.340218+00:00"},{"alias_kind":"pith_short_12","alias_value":"IDC3K2XENZD5","created_at":"2026-05-28T02:04:40.340218+00:00"},{"alias_kind":"pith_short_16","alias_value":"IDC3K2XENZD5TP7D","created_at":"2026-05-28T02:04:40.340218+00:00"},{"alias_kind":"pith_short_8","alias_value":"IDC3K2XE","created_at":"2026-05-28T02:04:40.340218+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"2605.21625","citing_title":"Flat-Pack Bench: Evaluating Spatio-Temporal Understanding in Large Vision-Language Models through Furniture Assembly","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2505.19713","citing_title":"CAD-Coder: Text-to-CAD Generation with Chain-of-Thought and Geometric Reward","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2603.08592","citing_title":"Boosting MLLM Spatial Reasoning with Geometrically Referenced 3D Scene Representations","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09218","citing_title":"Flame3D: Zero-shot Compositional Reasoning of 3D Scenes with Agentic Language Models","ref_index":6,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII","json":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII.json","graph_json":"https://pith.science/api/pith-number/IDC3K2XENZD5TP7DK7CGDHKNII/graph.json","events_json":"https://pith.science/api/pith-number/IDC3K2XENZD5TP7DK7CGDHKNII/events.json","paper":"https://pith.science/paper/IDC3K2XE"},"agent_actions":{"view_html":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII","download_json":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII.json","view_paper":"https://pith.science/paper/IDC3K2XE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2504.04540&json=true","fetch_graph":"https://pith.science/api/pith-number/IDC3K2XENZD5TP7DK7CGDHKNII/graph.json","fetch_events":"https://pith.science/api/pith-number/IDC3K2XENZD5TP7DK7CGDHKNII/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII/action/timestamp_anchor","attest_storage":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII/action/storage_attestation","attest_author":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII/action/author_attestation","sign_citation":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII/action/citation_signature","submit_replication":"https://pith.science/pith/IDC3K2XENZD5TP7DK7CGDHKNII/action/replication_record"}},"created_at":"2026-05-28T02:04:40.340218+00:00","updated_at":"2026-05-28T02:04:40.340218+00:00"}