{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:FDFQ7MUSVEOSYLYFLZD2XRRULP","short_pith_number":"pith:FDFQ7MUS","schema_version":"1.0","canonical_sha256":"28cb0fb292a91d2c2f055e47abc6345bc1c28c434b73371c73ec20ed07702991","source":{"kind":"arxiv","id":"2604.00913","version":2},"attestation_state":"computed","paper":{"title":"Benchmarking and Mechanistic Analysis of Vision-Language Models for Cross-Depiction Assembly Instruction Alignment","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Yao Zhang, Yu Xiao, Zhuchenyang Liu","submitted_at":"2026-04-01T13:55:28Z","abstract_excerpt":"2D assembly diagrams are often abstract and hard to follow, creating a need for intelligent assistants that can monitor progress, detect errors, and provide step-by-step guidance. In mixed reality settings, such systems must recognize completed and ongoing steps from the camera feed and align them with the diagram instructions. Vision Language Models (VLMs) show promise for this task, but face a depiction gap because assembly diagrams and video frames share few visual features. To systematically assess this gap, we construct IKEA-Bench, a benchmark of 1,623 questions across 6 task types on 29 "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2604.00913","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-04-01T13:55:28Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"d6c4ddf05212ff4ca17fa092c601d139980a7b96cc2d13777e464283bf14c1bb","abstract_canon_sha256":"2fc5868146bde3883500890ee60287312c0088bf72dbb7acf210b0742cddd12b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T01:05:12.239025Z","signature_b64":"biUCXTQijmuV3F/CWkxLtXYRwDjmPk19Q3dQnlXs8HB2i4xcqHQ8YxTFP+0Hkix9BLc/tGOsZf9d5sGjNkPpCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"28cb0fb292a91d2c2f055e47abc6345bc1c28c434b73371c73ec20ed07702991","last_reissued_at":"2026-05-28T01:05:12.238523Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T01:05:12.238523Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Benchmarking and Mechanistic Analysis of Vision-Language Models for Cross-Depiction Assembly Instruction Alignment","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Yao Zhang, Yu Xiao, Zhuchenyang Liu","submitted_at":"2026-04-01T13:55:28Z","abstract_excerpt":"2D assembly diagrams are often abstract and hard to follow, creating a need for intelligent assistants that can monitor progress, detect errors, and provide step-by-step guidance. In mixed reality settings, such systems must recognize completed and ongoing steps from the camera feed and align them with the diagram instructions. Vision Language Models (VLMs) show promise for this task, but face a depiction gap because assembly diagrams and video frames share few visual features. To systematically assess this gap, we construct IKEA-Bench, a benchmark of 1,623 questions across 6 task types on 29 "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.00913","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.00913/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.00913","created_at":"2026-05-28T01:05:12.238583+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.00913v2","created_at":"2026-05-28T01:05:12.238583+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.00913","created_at":"2026-05-28T01:05:12.238583+00:00"},{"alias_kind":"pith_short_12","alias_value":"FDFQ7MUSVEOS","created_at":"2026-05-28T01:05:12.238583+00:00"},{"alias_kind":"pith_short_16","alias_value":"FDFQ7MUSVEOSYLYF","created_at":"2026-05-28T01:05:12.238583+00:00"},{"alias_kind":"pith_short_8","alias_value":"FDFQ7MUS","created_at":"2026-05-28T01:05:12.238583+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP","json":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP.json","graph_json":"https://pith.science/api/pith-number/FDFQ7MUSVEOSYLYFLZD2XRRULP/graph.json","events_json":"https://pith.science/api/pith-number/FDFQ7MUSVEOSYLYFLZD2XRRULP/events.json","paper":"https://pith.science/paper/FDFQ7MUS"},"agent_actions":{"view_html":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP","download_json":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP.json","view_paper":"https://pith.science/paper/FDFQ7MUS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.00913&json=true","fetch_graph":"https://pith.science/api/pith-number/FDFQ7MUSVEOSYLYFLZD2XRRULP/graph.json","fetch_events":"https://pith.science/api/pith-number/FDFQ7MUSVEOSYLYFLZD2XRRULP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP/action/storage_attestation","attest_author":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP/action/author_attestation","sign_citation":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP/action/citation_signature","submit_replication":"https://pith.science/pith/FDFQ7MUSVEOSYLYFLZD2XRRULP/action/replication_record"}},"created_at":"2026-05-28T01:05:12.238583+00:00","updated_at":"2026-05-28T01:05:12.238583+00:00"}