{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:PFSVRTVMWP6CFX6LFLP5JHUH54","short_pith_number":"pith:PFSVRTVM","canonical_record":{"source":{"id":"2605.14498","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T07:38:29Z","cross_cats_sorted":[],"title_canon_sha256":"2cb1a9babb0c11bd583a0fb06ade13e53199b2773ceb18a9971d09d21f9de78f","abstract_canon_sha256":"88edf3ae25000759317ae305503a4c09b7cc901c943ecec0d81cdee0e80a20e2"},"schema_version":"1.0"},"canonical_sha256":"796558ceacb3fc22dfcb2adfd49e87ef0737163bb18c20c2945ae60cee375576","source":{"kind":"arxiv","id":"2605.14498","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14498","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14498v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14498","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"PFSVRTVMWP6C","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"PFSVRTVMWP6CFX6L","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"PFSVRTVM","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:PFSVRTVMWP6CFX6LFLP5JHUH54","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14498","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T07:38:29Z","cross_cats_sorted":[],"title_canon_sha256":"2cb1a9babb0c11bd583a0fb06ade13e53199b2773ceb18a9971d09d21f9de78f","abstract_canon_sha256":"88edf3ae25000759317ae305503a4c09b7cc901c943ecec0d81cdee0e80a20e2"},"schema_version":"1.0"},"canonical_sha256":"796558ceacb3fc22dfcb2adfd49e87ef0737163bb18c20c2945ae60cee375576","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:06.350057Z","signature_b64":"sYk/iZ7oKmqNl73iw1iw7pRAle5inmTKir8y+goLJ2Ldi2DUTDSHtge8mLizSPtdO/nVywHPFYwBkeuk/5VHDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"796558ceacb3fc22dfcb2adfd49e87ef0737163bb18c20c2945ae60cee375576","last_reissued_at":"2026-05-17T23:39:06.349356Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:06.349356Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14498","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WSNnkR9j0UMYzP6Y1CWivD+mTONBOe6m+04XynYfDBY9EV1t9IJt3RoBrwtqfbPhpMmA822lcCE2UuREM3CbAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T07:57:17.778099Z"},"content_sha256":"411f0631d6da68704433572310c776876dfc20cb9bfa2a0a5d2073a2ef58dc1e","schema_version":"1.0","event_id":"sha256:411f0631d6da68704433572310c776876dfc20cb9bfa2a0a5d2073a2ef58dc1e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:PFSVRTVMWP6CFX6LFLP5JHUH54","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"GroupMemBench: Benchmarking LLM Agent Memory in Multi-Party Conversations","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Benchmarking shows leading LLM memory systems reach only 46 percent accuracy in multi-party conversations, with BM25 matching most.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Evgeniy Gabrilovich, Jingbo Yang, Kwei-Herng Lai, Shiyu Chang, Xiaowen Wang, Yaar Harari","submitted_at":"2026-05-14T07:38:29Z","abstract_excerpt":"Large Language Model (LLM) agents increasingly serve as personal assistants and workplace collaborators, where their utility depends on memory systems that extract, retrieve, and apply information across long-running conversations. However, both existing memory systems and benchmarks are built around the dyadic, single-user setup, even though real deployments routinely span groups and channels with multiple users interacting with the agent and with each other. This mismatch leaves three properties of group memory unmeasured: (i) group dynamics that go beyond concatenated one-on-one chats, (ii)"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Benchmarking leading memory systems exposes a sharp collapse: the strongest one reaches only 46.0% average accuracy, with knowledge update at 27.1% and term ambiguity at 37.7%, while a simple BM25 baseline matches or exceeds most agent memory systems.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The graph-grounded synthesis pipeline and adversarial query generation produce conversations and questions that faithfully capture the three unmeasured properties of group memory (group dynamics, speaker-grounded belief tracking, and audience-adapted language) in real deployments.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GroupMemBench shows leading LLM memory systems reach only 46% average accuracy on multi-party tasks, with a simple BM25 baseline matching or beating most of them.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Benchmarking shows leading LLM memory systems reach only 46 percent accuracy in multi-party conversations, with BM25 matching most.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"c002da65ea2c46d80e8fc2a204147f819b93d3b73621c3bff30ea264b5f02aad"},"source":{"id":"2605.14498","kind":"arxiv","version":1},"verdict":{"id":"75e20c67-f767-42d8-ad12-4af31b192c03","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T01:50:50.981104Z","strongest_claim":"Benchmarking leading memory systems exposes a sharp collapse: the strongest one reaches only 46.0% average accuracy, with knowledge update at 27.1% and term ambiguity at 37.7%, while a simple BM25 baseline matches or exceeds most agent memory systems.","one_line_summary":"GroupMemBench shows leading LLM memory systems reach only 46% average accuracy on multi-party tasks, with a simple BM25 baseline matching or beating most of them.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The graph-grounded synthesis pipeline and adversarial query generation produce conversations and questions that faithfully capture the three unmeasured properties of group memory (group dynamics, speaker-grounded belief tracking, and audience-adapted language) in real deployments.","pith_extraction_headline":"Benchmarking shows leading LLM memory systems reach only 46 percent accuracy in multi-party conversations, with BM25 matching most."},"references":{"count":47,"sample":[{"doi":"","year":2026,"title":"OpenClaw-RL: Train Any Agent Simply by Talking","work_id":"78607317-8305-4515-8dc3-20b4ff5b8f3a","ref_index":1,"cited_arxiv_id":"2603.10165","is_internal_anchor":true},{"doi":"","year":2026,"title":"Agent Skills for Large Language Models: Architecture, Acquisition, Security, and the Path Forward","work_id":"a64d3985-6826-492c-8a2e-f0965d805bfd","ref_index":2,"cited_arxiv_id":"2602.12430","is_internal_anchor":true},{"doi":"","year":2025,"title":"Trust in ai chatbots: A systematic review.Telematics and Informatics, 97:102240, 2025","work_id":"825eaf4d-ab6a-4555-94bc-8d14282e6f4d","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Personal LLM Agents: Insights and Survey about the Capability, Efficiency and Security","work_id":"7692e42c-c83d-4527-ba16-9c4b69666d47","ref_index":4,"cited_arxiv_id":"2401.05459","is_internal_anchor":true},{"doi":"","year":2024,"title":"A survey on large language model based autonomous agents.Frontiers of Computer Science, 18(6):186345","work_id":"22d0f278-0f8d-4a9a-941a-1a0feef5523b","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":47,"snapshot_sha256":"6b4f85fc79364f472a714008e5e0a1f8c58941441f5466702b755e689a5e9caa","internal_anchors":10},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"75e20c67-f767-42d8-ad12-4af31b192c03"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hbkVGhUfU62XWTO4nVD5fihsDoO0vqpYARx7VLZdUWdyrpWe7o08tLkMcA/Mw/jy0Q20DNi+oDOczjThZa3zAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T07:57:17.778638Z"},"content_sha256":"0a9d15f673c284ac08259a6f591ff9471e8cf307caef664846b0a5539657a644","schema_version":"1.0","event_id":"sha256:0a9d15f673c284ac08259a6f591ff9471e8cf307caef664846b0a5539657a644"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/PFSVRTVMWP6CFX6LFLP5JHUH54/bundle.json","state_url":"https://pith.science/pith/PFSVRTVMWP6CFX6LFLP5JHUH54/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/PFSVRTVMWP6CFX6LFLP5JHUH54/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T07:57:17Z","links":{"resolver":"https://pith.science/pith/PFSVRTVMWP6CFX6LFLP5JHUH54","bundle":"https://pith.science/pith/PFSVRTVMWP6CFX6LFLP5JHUH54/bundle.json","state":"https://pith.science/pith/PFSVRTVMWP6CFX6LFLP5JHUH54/state.json","well_known_bundle":"https://pith.science/.well-known/pith/PFSVRTVMWP6CFX6LFLP5JHUH54/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:PFSVRTVMWP6CFX6LFLP5JHUH54","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"88edf3ae25000759317ae305503a4c09b7cc901c943ecec0d81cdee0e80a20e2","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T07:38:29Z","title_canon_sha256":"2cb1a9babb0c11bd583a0fb06ade13e53199b2773ceb18a9971d09d21f9de78f"},"schema_version":"1.0","source":{"id":"2605.14498","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14498","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14498v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14498","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"PFSVRTVMWP6C","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"PFSVRTVMWP6CFX6L","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"PFSVRTVM","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:0a9d15f673c284ac08259a6f591ff9471e8cf307caef664846b0a5539657a644","target":"graph","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Benchmarking leading memory systems exposes a sharp collapse: the strongest one reaches only 46.0% average accuracy, with knowledge update at 27.1% and term ambiguity at 37.7%, while a simple BM25 baseline matches or exceeds most agent memory systems."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The graph-grounded synthesis pipeline and adversarial query generation produce conversations and questions that faithfully capture the three unmeasured properties of group memory (group dynamics, speaker-grounded belief tracking, and audience-adapted language) in real deployments."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GroupMemBench shows leading LLM memory systems reach only 46% average accuracy on multi-party tasks, with a simple BM25 baseline matching or beating most of them."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Benchmarking shows leading LLM memory systems reach only 46 percent accuracy in multi-party conversations, with BM25 matching most."}],"snapshot_sha256":"c002da65ea2c46d80e8fc2a204147f819b93d3b73621c3bff30ea264b5f02aad"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Large Language Model (LLM) agents increasingly serve as personal assistants and workplace collaborators, where their utility depends on memory systems that extract, retrieve, and apply information across long-running conversations. However, both existing memory systems and benchmarks are built around the dyadic, single-user setup, even though real deployments routinely span groups and channels with multiple users interacting with the agent and with each other. This mismatch leaves three properties of group memory unmeasured: (i) group dynamics that go beyond concatenated one-on-one chats, (ii)","authors_text":"Evgeniy Gabrilovich, Jingbo Yang, Kwei-Herng Lai, Shiyu Chang, Xiaowen Wang, Yaar Harari","cross_cats":[],"headline":"Benchmarking shows leading LLM memory systems reach only 46 percent accuracy in multi-party conversations, with BM25 matching most.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T07:38:29Z","title":"GroupMemBench: Benchmarking LLM Agent Memory in Multi-Party Conversations"},"references":{"count":47,"internal_anchors":10,"resolved_work":47,"sample":[{"cited_arxiv_id":"2603.10165","doi":"","is_internal_anchor":true,"ref_index":1,"title":"OpenClaw-RL: Train Any Agent Simply by Talking","work_id":"78607317-8305-4515-8dc3-20b4ff5b8f3a","year":2026},{"cited_arxiv_id":"2602.12430","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Agent Skills for Large Language Models: Architecture, Acquisition, Security, and the Path Forward","work_id":"a64d3985-6826-492c-8a2e-f0965d805bfd","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Trust in ai chatbots: A systematic review.Telematics and Informatics, 97:102240, 2025","work_id":"825eaf4d-ab6a-4555-94bc-8d14282e6f4d","year":2025},{"cited_arxiv_id":"2401.05459","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Personal LLM Agents: Insights and Survey about the Capability, Efficiency and Security","work_id":"7692e42c-c83d-4527-ba16-9c4b69666d47","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"A survey on large language model based autonomous agents.Frontiers of Computer Science, 18(6):186345","work_id":"22d0f278-0f8d-4a9a-941a-1a0feef5523b","year":2024}],"snapshot_sha256":"6b4f85fc79364f472a714008e5e0a1f8c58941441f5466702b755e689a5e9caa"},"source":{"id":"2605.14498","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T01:50:50.981104Z","id":"75e20c67-f767-42d8-ad12-4af31b192c03","model_set":{"reader":"grok-4.3"},"one_line_summary":"GroupMemBench shows leading LLM memory systems reach only 46% average accuracy on multi-party tasks, with a simple BM25 baseline matching or beating most of them.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Benchmarking shows leading LLM memory systems reach only 46 percent accuracy in multi-party conversations, with BM25 matching most.","strongest_claim":"Benchmarking leading memory systems exposes a sharp collapse: the strongest one reaches only 46.0% average accuracy, with knowledge update at 27.1% and term ambiguity at 37.7%, while a simple BM25 baseline matches or exceeds most agent memory systems.","weakest_assumption":"The graph-grounded synthesis pipeline and adversarial query generation produce conversations and questions that faithfully capture the three unmeasured properties of group memory (group dynamics, speaker-grounded belief tracking, and audience-adapted language) in real deployments."}},"verdict_id":"75e20c67-f767-42d8-ad12-4af31b192c03"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:411f0631d6da68704433572310c776876dfc20cb9bfa2a0a5d2073a2ef58dc1e","target":"record","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"88edf3ae25000759317ae305503a4c09b7cc901c943ecec0d81cdee0e80a20e2","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-14T07:38:29Z","title_canon_sha256":"2cb1a9babb0c11bd583a0fb06ade13e53199b2773ceb18a9971d09d21f9de78f"},"schema_version":"1.0","source":{"id":"2605.14498","kind":"arxiv","version":1}},"canonical_sha256":"796558ceacb3fc22dfcb2adfd49e87ef0737163bb18c20c2945ae60cee375576","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"796558ceacb3fc22dfcb2adfd49e87ef0737163bb18c20c2945ae60cee375576","first_computed_at":"2026-05-17T23:39:06.349356Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:06.349356Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"sYk/iZ7oKmqNl73iw1iw7pRAle5inmTKir8y+goLJ2Ldi2DUTDSHtge8mLizSPtdO/nVywHPFYwBkeuk/5VHDw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:06.350057Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14498","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:411f0631d6da68704433572310c776876dfc20cb9bfa2a0a5d2073a2ef58dc1e","sha256:0a9d15f673c284ac08259a6f591ff9471e8cf307caef664846b0a5539657a644"],"state_sha256":"80c16a94f82f3d111248511c32eb3e0c76b46028c0fe3f6a922cd3a44fffbe85"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"2lAMzSSgZRm8dcJs58lxro+79hQNZuhl+szK17Je/nj0LQopNyASFAEcI9nd3qt2aoHZjkGed/TYDs+qtAfxCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T07:57:17.781006Z","bundle_sha256":"2842dcc5a9cd4093ca25d31bfb67ff9dd6b25341671c3cacee78c391a0e4453a"}}