{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:IYMC7UPYBQY5DBSAE2VWSD4UF5","short_pith_number":"pith:IYMC7UPY","canonical_record":{"source":{"id":"2602.22455","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-25T22:30:56Z","cross_cats_sorted":[],"title_canon_sha256":"da83502ca2eb4f298aae37c31c5d9091276128b436d318396e3bc200263965fb","abstract_canon_sha256":"de4a78cfc23d68ff46cbe0015bda506a43b7a341bc678a10f075d76f1605b88c"},"schema_version":"1.0"},"canonical_sha256":"46182fd1f80c31d1864026ab690f942f477888ca70484fac2f34be35d02566dc","source":{"kind":"arxiv","id":"2602.22455","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.22455","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"arxiv_version","alias_value":"2602.22455v2","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.22455","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"pith_short_12","alias_value":"IYMC7UPYBQY5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"IYMC7UPYBQY5DBSA","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"IYMC7UPY","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:IYMC7UPYBQY5DBSAE2VWSD4UF5","target":"record","payload":{"canonical_record":{"source":{"id":"2602.22455","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-25T22:30:56Z","cross_cats_sorted":[],"title_canon_sha256":"da83502ca2eb4f298aae37c31c5d9091276128b436d318396e3bc200263965fb","abstract_canon_sha256":"de4a78cfc23d68ff46cbe0015bda506a43b7a341bc678a10f075d76f1605b88c"},"schema_version":"1.0"},"canonical_sha256":"46182fd1f80c31d1864026ab690f942f477888ca70484fac2f34be35d02566dc","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:09:23.316895Z","signature_b64":"0LKgd2/EZxVLcKRVr4jvIBlvSEfPBdUvxbnHzXMiTgX0/MFJGf8kqH0w+VMkKfHgSvzLrCDXU1NdX6LCMJ3mAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"46182fd1f80c31d1864026ab690f942f477888ca70484fac2f34be35d02566dc","last_reissued_at":"2026-05-18T03:09:23.316142Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:09:23.316142Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.22455","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:23Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HOeQ1E70sbKpDuSSeKvwILXJ8bpPeLdZ5feK8HgZGKGe9HwxXhoE8wujD0GlO0D9u4aC/2dm49tTOUKLPyy0Aw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-29T18:33:30.341373Z"},"content_sha256":"f748f81ddf756a32791742088e1e339209c77b5f17f0a784a18b8a097c39490e","schema_version":"1.0","event_id":"sha256:f748f81ddf756a32791742088e1e339209c77b5f17f0a784a18b8a097c39490e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:IYMC7UPYBQY5DBSAE2VWSD4UF5","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Exploring Multimodal LMMs for Online Episodic Memory Question Answering on the Edge","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Multimodal large language models can run real-time episodic memory question answering on edge devices with accuracy close to cloud services.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Antonino Furnari, Giuseppe Lando, Rosario Forte","submitted_at":"2026-02-25T22:30:56Z","abstract_excerpt":"We investigate the feasibility of using Multimodal Large Language Models (MLLMs) for real-time online episodic memory question answering. While cloud offloading is common, it raises privacy and latency concerns for wearable assistants, hence we investigate implementation on the edge. We integrated streaming constraints into our question answering pipeline, which is structured into two asynchronous threads: a Descriptor Thread that continuously converts video into a lightweight textual memory, and a Question Answering (QA) Thread that reasons over the textual memory to answer queries. Experimen"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"an end-to-end configuration running on a consumer-grade 8GB GPU achieves 51.76% accuracy with a Time-To-First-Token (TTFT) of 0.41s. Scaling to a local enterprise-grade server yields 54.40% accuracy with a TTFT of 0.88s. In comparison, a cloud-based solution obtains an accuracy of 56.00%.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The lightweight textual memory produced by the descriptor thread retains enough visual and temporal detail for the QA thread to answer questions accurately without substantial information loss.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"An edge-deployed multimodal LLM pipeline for online episodic memory QA reaches 51.76% accuracy on an 8 GB GPU and 54.40% on a local server, within 4-5 points of a 56% cloud baseline.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Multimodal large language models can run real-time episodic memory question answering on edge devices with accuracy close to cloud services.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"684f701c5a9fc770239401b3cf9b85b823da3423435e68a694524864e485f6c2"},"source":{"id":"2602.22455","kind":"arxiv","version":2},"verdict":{"id":"f35d7620-2924-46fe-b58c-09a2ab573c72","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T19:02:47.938624Z","strongest_claim":"an end-to-end configuration running on a consumer-grade 8GB GPU achieves 51.76% accuracy with a Time-To-First-Token (TTFT) of 0.41s. Scaling to a local enterprise-grade server yields 54.40% accuracy with a TTFT of 0.88s. In comparison, a cloud-based solution obtains an accuracy of 56.00%.","one_line_summary":"An edge-deployed multimodal LLM pipeline for online episodic memory QA reaches 51.76% accuracy on an 8 GB GPU and 54.40% on a local server, within 4-5 points of a 56% cloud baseline.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The lightweight textual memory produced by the descriptor thread retains enough visual and temporal detail for the QA thread to answer questions accurately without substantial information loss.","pith_extraction_headline":"Multimodal large language models can run real-time episodic memory question answering on edge devices with accuracy close to cloud services."},"references":{"count":14,"sample":[{"doi":"","year":2022,"title":"Bärmann, L. and Waibel, A. (2022). Where did i leave my keys? — episodic-memory-based ques- tion answering on egocentric videos. In 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition ","work_id":"a60dfde7-d09a-423d-a4b8-a392fa4e3e42","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Q., Song, C., Gao, D., Liu, J.-W., Gao, Z., Mao, D., and Shou, M","work_id":"eb5e7ada-003e-4ed5-a650-bb830431325a","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning","work_id":"fff3953b-5efb-4753-bee4-002f59995810","ref_index":3,"cited_arxiv_id":"2307.08691","is_internal_anchor":true},{"doi":"","year":2024,"title":"Di, S. and Xie, W. (2024). Grounded question- answering in long egocentric videos. In CVPR","work_id":"56871a64-e2c4-4618-bf68-547ec5066ad2","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"arXiv preprint arXiv:2503.00540 , year=","work_id":"f9b28e0b-f48b-484b-a271-22ecec990b86","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":14,"snapshot_sha256":"e8a4113724e5d0e13392a821dbb8cfdc8be7664407da4bfcdaa950d96b35c4fe","internal_anchors":3},"formal_canon":{"evidence_count":1,"snapshot_sha256":"ff9b83e5fa401f07ba156ed5aca0ba5bb460d436ef63c7655e6bd4380647aff0"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"f35d7620-2924-46fe-b58c-09a2ab573c72"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:09:23Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9QAaVXUks+ijfsagqixGqeuEGdcameKFPAX6XX6M2axWfDbkO9Ehwtd0Y/dqVzh0WrUl9Qp32blhCrDzYIZpBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-29T18:33:30.342087Z"},"content_sha256":"d046dab7c3d3d356060475900e546d404882f012a788dc9956deabd0b0ee1bee","schema_version":"1.0","event_id":"sha256:d046dab7c3d3d356060475900e546d404882f012a788dc9956deabd0b0ee1bee"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/IYMC7UPYBQY5DBSAE2VWSD4UF5/bundle.json","state_url":"https://pith.science/pith/IYMC7UPYBQY5DBSAE2VWSD4UF5/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/IYMC7UPYBQY5DBSAE2VWSD4UF5/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-29T18:33:30Z","links":{"resolver":"https://pith.science/pith/IYMC7UPYBQY5DBSAE2VWSD4UF5","bundle":"https://pith.science/pith/IYMC7UPYBQY5DBSAE2VWSD4UF5/bundle.json","state":"https://pith.science/pith/IYMC7UPYBQY5DBSAE2VWSD4UF5/state.json","well_known_bundle":"https://pith.science/.well-known/pith/IYMC7UPYBQY5DBSAE2VWSD4UF5/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:IYMC7UPYBQY5DBSAE2VWSD4UF5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"de4a78cfc23d68ff46cbe0015bda506a43b7a341bc678a10f075d76f1605b88c","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-25T22:30:56Z","title_canon_sha256":"da83502ca2eb4f298aae37c31c5d9091276128b436d318396e3bc200263965fb"},"schema_version":"1.0","source":{"id":"2602.22455","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.22455","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"arxiv_version","alias_value":"2602.22455v2","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.22455","created_at":"2026-05-18T03:09:23Z"},{"alias_kind":"pith_short_12","alias_value":"IYMC7UPYBQY5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"IYMC7UPYBQY5DBSA","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"IYMC7UPY","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:d046dab7c3d3d356060475900e546d404882f012a788dc9956deabd0b0ee1bee","target":"graph","created_at":"2026-05-18T03:09:23Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"an end-to-end configuration running on a consumer-grade 8GB GPU achieves 51.76% accuracy with a Time-To-First-Token (TTFT) of 0.41s. Scaling to a local enterprise-grade server yields 54.40% accuracy with a TTFT of 0.88s. In comparison, a cloud-based solution obtains an accuracy of 56.00%."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The lightweight textual memory produced by the descriptor thread retains enough visual and temporal detail for the QA thread to answer questions accurately without substantial information loss."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"An edge-deployed multimodal LLM pipeline for online episodic memory QA reaches 51.76% accuracy on an 8 GB GPU and 54.40% on a local server, within 4-5 points of a 56% cloud baseline."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Multimodal large language models can run real-time episodic memory question answering on edge devices with accuracy close to cloud services."}],"snapshot_sha256":"684f701c5a9fc770239401b3cf9b85b823da3423435e68a694524864e485f6c2"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"ff9b83e5fa401f07ba156ed5aca0ba5bb460d436ef63c7655e6bd4380647aff0"},"paper":{"abstract_excerpt":"We investigate the feasibility of using Multimodal Large Language Models (MLLMs) for real-time online episodic memory question answering. While cloud offloading is common, it raises privacy and latency concerns for wearable assistants, hence we investigate implementation on the edge. We integrated streaming constraints into our question answering pipeline, which is structured into two asynchronous threads: a Descriptor Thread that continuously converts video into a lightweight textual memory, and a Question Answering (QA) Thread that reasons over the textual memory to answer queries. Experimen","authors_text":"Antonino Furnari, Giuseppe Lando, Rosario Forte","cross_cats":[],"headline":"Multimodal large language models can run real-time episodic memory question answering on edge devices with accuracy close to cloud services.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-25T22:30:56Z","title":"Exploring Multimodal LMMs for Online Episodic Memory Question Answering on the Edge"},"references":{"count":14,"internal_anchors":3,"resolved_work":14,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Bärmann, L. and Waibel, A. (2022). Where did i leave my keys? — episodic-memory-based ques- tion answering on egocentric videos. In 2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition ","work_id":"a60dfde7-d09a-423d-a4b8-a392fa4e3e42","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Q., Song, C., Gao, D., Liu, J.-W., Gao, Z., Mao, D., and Shou, M","work_id":"eb5e7ada-003e-4ed5-a650-bb830431325a","year":2024},{"cited_arxiv_id":"2307.08691","doi":"","is_internal_anchor":true,"ref_index":3,"title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning","work_id":"fff3953b-5efb-4753-bee4-002f59995810","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Di, S. and Xie, W. (2024). Grounded question- answering in long egocentric videos. In CVPR","work_id":"56871a64-e2c4-4618-bf68-547ec5066ad2","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"arXiv preprint arXiv:2503.00540 , year=","work_id":"f9b28e0b-f48b-484b-a271-22ecec990b86","year":2025}],"snapshot_sha256":"e8a4113724e5d0e13392a821dbb8cfdc8be7664407da4bfcdaa950d96b35c4fe"},"source":{"id":"2602.22455","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T19:02:47.938624Z","id":"f35d7620-2924-46fe-b58c-09a2ab573c72","model_set":{"reader":"grok-4.3"},"one_line_summary":"An edge-deployed multimodal LLM pipeline for online episodic memory QA reaches 51.76% accuracy on an 8 GB GPU and 54.40% on a local server, within 4-5 points of a 56% cloud baseline.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Multimodal large language models can run real-time episodic memory question answering on edge devices with accuracy close to cloud services.","strongest_claim":"an end-to-end configuration running on a consumer-grade 8GB GPU achieves 51.76% accuracy with a Time-To-First-Token (TTFT) of 0.41s. Scaling to a local enterprise-grade server yields 54.40% accuracy with a TTFT of 0.88s. In comparison, a cloud-based solution obtains an accuracy of 56.00%.","weakest_assumption":"The lightweight textual memory produced by the descriptor thread retains enough visual and temporal detail for the QA thread to answer questions accurately without substantial information loss."}},"verdict_id":"f35d7620-2924-46fe-b58c-09a2ab573c72"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f748f81ddf756a32791742088e1e339209c77b5f17f0a784a18b8a097c39490e","target":"record","created_at":"2026-05-18T03:09:23Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"de4a78cfc23d68ff46cbe0015bda506a43b7a341bc678a10f075d76f1605b88c","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-25T22:30:56Z","title_canon_sha256":"da83502ca2eb4f298aae37c31c5d9091276128b436d318396e3bc200263965fb"},"schema_version":"1.0","source":{"id":"2602.22455","kind":"arxiv","version":2}},"canonical_sha256":"46182fd1f80c31d1864026ab690f942f477888ca70484fac2f34be35d02566dc","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"46182fd1f80c31d1864026ab690f942f477888ca70484fac2f34be35d02566dc","first_computed_at":"2026-05-18T03:09:23.316142Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:23.316142Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"0LKgd2/EZxVLcKRVr4jvIBlvSEfPBdUvxbnHzXMiTgX0/MFJGf8kqH0w+VMkKfHgSvzLrCDXU1NdX6LCMJ3mAg==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:23.316895Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.22455","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f748f81ddf756a32791742088e1e339209c77b5f17f0a784a18b8a097c39490e","sha256:d046dab7c3d3d356060475900e546d404882f012a788dc9956deabd0b0ee1bee"],"state_sha256":"1d3deea0e235179d80611c7f11773df6ba8a6f5240fa11625319443b9d082894"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"XdnLr9Qt0MZj7AICUHluCUM0sAnxapztWCACQGDNIFbHF7LHwbOVgezy/cjnS1ARIs+MRs1aLvDQ+Bx81A7qCQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-29T18:33:30.345895Z","bundle_sha256":"4b416869d776d6e06fc5831886fe58540b22c66e1243573f2780f083090ae18f"}}