{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:DGNFU6XGZCUE4OYM7HEFYNSFRZ","short_pith_number":"pith:DGNFU6XG","schema_version":"1.0","canonical_sha256":"199a5a7ae6c8a84e3b0cf9c85c36458e774305f2018d6e372ceaf163508e4ec7","source":{"kind":"arxiv","id":"2605.21625","version":1},"attestation_state":"computed","paper":{"title":"Flat-Pack Bench: Evaluating Spatio-Temporal Understanding in Large Vision-Language Models through Furniture Assembly","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Aditya Chetan, Bharath Hariharan, Bharath Raj Nagoor Kani, Eric Cai, Noah Snavely, Peeyush Kushwaha, Qianqian Wang, Utkarsh Mall","submitted_at":"2026-05-20T18:36:57Z","abstract_excerpt":"The emergence of Large Vision-Language Models (LVLMs) has significantly advanced video understanding capabilities. However, existing benchmarks focus predominantly on coarse-grained tasks such as action segmentation, classification, captioning, and retrieval. Furthermore, these benchmarks often rely on entities that can be easily identified verbally, like household objects, animals, human subjects, etc., limiting their applicability to complex, in-the-wild video scenarios. But, many applications such as furniture assembly, cooking, etc., require step-by-step fine-grained spatio-temporal unders"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.21625","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-20T18:36:57Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"f6e8d7a9b2c46f8eb0e8cfaa02e86d1520aa8031adca37ca830f3c0e060afe86","abstract_canon_sha256":"8604e91700112d90e35a79ee79780b8735c019b37e13e09c565c2ec19c671418"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T01:03:25.935225Z","signature_b64":"KbME+9WFPMnf2kHdQFsaIzZkOkiZufjibemAPwRpIwXN+vDS9mKfVs4RlHLoFN4XxwHcCQvuUNprSODwSKEaBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"199a5a7ae6c8a84e3b0cf9c85c36458e774305f2018d6e372ceaf163508e4ec7","last_reissued_at":"2026-05-22T01:03:25.934614Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T01:03:25.934614Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Flat-Pack Bench: Evaluating Spatio-Temporal Understanding in Large Vision-Language Models through Furniture Assembly","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Aditya Chetan, Bharath Hariharan, Bharath Raj Nagoor Kani, Eric Cai, Noah Snavely, Peeyush Kushwaha, Qianqian Wang, Utkarsh Mall","submitted_at":"2026-05-20T18:36:57Z","abstract_excerpt":"The emergence of Large Vision-Language Models (LVLMs) has significantly advanced video understanding capabilities. However, existing benchmarks focus predominantly on coarse-grained tasks such as action segmentation, classification, captioning, and retrieval. Furthermore, these benchmarks often rely on entities that can be easily identified verbally, like household objects, animals, human subjects, etc., limiting their applicability to complex, in-the-wild video scenarios. But, many applications such as furniture assembly, cooking, etc., require step-by-step fine-grained spatio-temporal unders"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.21625","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.21625/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.21625","created_at":"2026-05-22T01:03:25.934693+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.21625v1","created_at":"2026-05-22T01:03:25.934693+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.21625","created_at":"2026-05-22T01:03:25.934693+00:00"},{"alias_kind":"pith_short_12","alias_value":"DGNFU6XGZCUE","created_at":"2026-05-22T01:03:25.934693+00:00"},{"alias_kind":"pith_short_16","alias_value":"DGNFU6XGZCUE4OYM","created_at":"2026-05-22T01:03:25.934693+00:00"},{"alias_kind":"pith_short_8","alias_value":"DGNFU6XG","created_at":"2026-05-22T01:03:25.934693+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ","json":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ.json","graph_json":"https://pith.science/api/pith-number/DGNFU6XGZCUE4OYM7HEFYNSFRZ/graph.json","events_json":"https://pith.science/api/pith-number/DGNFU6XGZCUE4OYM7HEFYNSFRZ/events.json","paper":"https://pith.science/paper/DGNFU6XG"},"agent_actions":{"view_html":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ","download_json":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ.json","view_paper":"https://pith.science/paper/DGNFU6XG","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.21625&json=true","fetch_graph":"https://pith.science/api/pith-number/DGNFU6XGZCUE4OYM7HEFYNSFRZ/graph.json","fetch_events":"https://pith.science/api/pith-number/DGNFU6XGZCUE4OYM7HEFYNSFRZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ/action/storage_attestation","attest_author":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ/action/author_attestation","sign_citation":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ/action/citation_signature","submit_replication":"https://pith.science/pith/DGNFU6XGZCUE4OYM7HEFYNSFRZ/action/replication_record"}},"created_at":"2026-05-22T01:03:25.934693+00:00","updated_at":"2026-05-22T01:03:25.934693+00:00"}