{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:2PUHZY6NP4F22ZWHTP3P4ZP6RY","short_pith_number":"pith:2PUHZY6N","canonical_record":{"source":{"id":"2605.14483","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T07:24:09Z","cross_cats_sorted":[],"title_canon_sha256":"4e19252c61acca31c7e638280dea611978d7de2676970ccf31a539c3e48334e0","abstract_canon_sha256":"236757d0ccb35a66f11aa890ef56facf0f58ebbd3968e1fd69fee72bd6ec8a80"},"schema_version":"1.0"},"canonical_sha256":"d3e87ce3cd7f0bad66c79bf6fe65fe8e3a22fa6f9717bbd618046fcf5f8ff633","source":{"kind":"arxiv","id":"2605.14483","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14483","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14483v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14483","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"2PUHZY6NP4F2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"2PUHZY6NP4F22ZWH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"2PUHZY6N","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:2PUHZY6NP4F22ZWHTP3P4ZP6RY","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14483","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T07:24:09Z","cross_cats_sorted":[],"title_canon_sha256":"4e19252c61acca31c7e638280dea611978d7de2676970ccf31a539c3e48334e0","abstract_canon_sha256":"236757d0ccb35a66f11aa890ef56facf0f58ebbd3968e1fd69fee72bd6ec8a80"},"schema_version":"1.0"},"canonical_sha256":"d3e87ce3cd7f0bad66c79bf6fe65fe8e3a22fa6f9717bbd618046fcf5f8ff633","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:06.526537Z","signature_b64":"rIhyDe+aY5dNHgy3H7qQzsIa0jywdniO2c2i+fMClEwFUsUYspCqU8iQfBqnnvtEKtQ9I5OQSR9rmCWWJvueDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d3e87ce3cd7f0bad66c79bf6fe65fe8e3a22fa6f9717bbd618046fcf5f8ff633","last_reissued_at":"2026-05-17T23:39:06.525807Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:06.525807Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14483","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"FIVfoOkZA9caHVikPv8rA/3NqIGpmUJmxHiA4MEZD+vQVt6lWpZnFg3B9Vtq52jLaSbU9Qfhp/kjLZn9XTZOBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T02:47:58.933553Z"},"content_sha256":"ebd61578028f500248a5aa6260c08b07cadd3d87ef69bc42eca26fd486fffe1f","schema_version":"1.0","event_id":"sha256:ebd61578028f500248a5aa6260c08b07cadd3d87ef69bc42eca26fd486fffe1f"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:2PUHZY6NP4F22ZWHTP3P4ZP6RY","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"LEMON: Learning Executable Multi-Agent Orchestration via Counterfactual Reinforcement Learning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Training via localized counterfactual edits allows an LLM to generate executable multi-agent orchestrations that outperform prior methods on reasoning and coding benchmarks.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Hua Wei, Kaize Ding, Xudong Chen, Yixin Liu","submitted_at":"2026-05-14T07:24:09Z","abstract_excerpt":"Large language models (LLMs) have become a strong foundation for multi-agent systems, but their effectiveness depends heavily on orchestration design. Across different tasks, role design, capacity assignment, and dependency construction jointly affect both solution quality and execution efficiency. Existing approaches automate parts of this design process, yet they often optimize these decisions partially or sequentially, and rely on execution-level feedback that provides limited credit assignment for local orchestration decisions. We propose LEMON (\\textbf{L}earning \\textbf{E}xecutable \\textb"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"LEMON achieves state-of-the-art performance among the evaluated multi-agent orchestration methods on six reasoning and coding benchmarks including MMLU, GSM8K, AQuA, MultiArith, SVAMP, and HumanEval.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That editing single orchestration fields and measuring the resulting reward contrast supplies reliable, localized credit assignment superior to standard execution-level feedback.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LEMON trains an LLM orchestrator with counterfactual-augmented GRPO to produce deployable multi-agent specifications that reach state-of-the-art results on six reasoning and coding benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Training via localized counterfactual edits allows an LLM to generate executable multi-agent orchestrations that outperform prior methods on reasoning and coding benchmarks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7b224a068da5e6ea6e0e678419486e10c1c86102ac67b8b536c25faa1d982b31"},"source":{"id":"2605.14483","kind":"arxiv","version":1},"verdict":{"id":"0fbdaae8-fae4-418d-9900-b76c5d4c3aa9","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T02:10:02.859340Z","strongest_claim":"LEMON achieves state-of-the-art performance among the evaluated multi-agent orchestration methods on six reasoning and coding benchmarks including MMLU, GSM8K, AQuA, MultiArith, SVAMP, and HumanEval.","one_line_summary":"LEMON trains an LLM orchestrator with counterfactual-augmented GRPO to produce deployable multi-agent specifications that reach state-of-the-art results on six reasoning and coding benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That editing single orchestration fields and measuring the resulting reward contrast supplies reliable, localized credit assignment superior to standard execution-level feedback.","pith_extraction_headline":"Training via localized counterfactual edits allows an LLM to generate executable multi-agent orchestrations that outperform prior methods on reasoning and coding benchmarks."},"references":{"count":35,"sample":[{"doi":"","year":2024,"title":"Autogen: Enabling next-gen llm applications via multi-agent conversations","work_id":"e57ce12a-7d16-4d21-a253-28bdb8094e1a","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Camel: Communicative agents for\" mind\" exploration of large language model society.Advances in neural information processing systems, 36:51991–52008","work_id":"f0e6f682-8c56-41b2-8626-859078781db0","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Metagpt: Meta programming for a multi-agent collaborative framework","work_id":"c406797c-06b5-46e2-b568-86ecd25692f1","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Large Language Model based Multi-Agents: A Survey of Progress and Challenges","work_id":"fb905249-ea5f-4765-80f0-2428ea66f15f","ref_index":4,"cited_arxiv_id":"2402.01680","is_internal_anchor":true},{"doi":"","year":2024,"title":"Improv- ing factuality and reasoning in language models through multiagent debate","work_id":"9479970d-f903-4986-a6bc-d40079ce2a58","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":35,"snapshot_sha256":"bf5b04dc9802d00a97d73ccc6bfcd225fc2f58af1f3e979dc485bb98f69f77db","internal_anchors":8},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"0fbdaae8-fae4-418d-9900-b76c5d4c3aa9"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"RLmF3KCBIN/lHQ59O+joQoPPm2JasoCeb6KJVdvsWUD4/9/61BvUaAtUk2rgA/NATmApCXIrBZhbe+vS+wN2CQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T02:47:58.934787Z"},"content_sha256":"f559efbc8a2c44828c6e49931df0b2b10ccdf5d907bef0d87fb1914770e058cf","schema_version":"1.0","event_id":"sha256:f559efbc8a2c44828c6e49931df0b2b10ccdf5d907bef0d87fb1914770e058cf"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/2PUHZY6NP4F22ZWHTP3P4ZP6RY/bundle.json","state_url":"https://pith.science/pith/2PUHZY6NP4F22ZWHTP3P4ZP6RY/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/2PUHZY6NP4F22ZWHTP3P4ZP6RY/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T02:47:58Z","links":{"resolver":"https://pith.science/pith/2PUHZY6NP4F22ZWHTP3P4ZP6RY","bundle":"https://pith.science/pith/2PUHZY6NP4F22ZWHTP3P4ZP6RY/bundle.json","state":"https://pith.science/pith/2PUHZY6NP4F22ZWHTP3P4ZP6RY/state.json","well_known_bundle":"https://pith.science/.well-known/pith/2PUHZY6NP4F22ZWHTP3P4ZP6RY/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:2PUHZY6NP4F22ZWHTP3P4ZP6RY","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"236757d0ccb35a66f11aa890ef56facf0f58ebbd3968e1fd69fee72bd6ec8a80","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T07:24:09Z","title_canon_sha256":"4e19252c61acca31c7e638280dea611978d7de2676970ccf31a539c3e48334e0"},"schema_version":"1.0","source":{"id":"2605.14483","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14483","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14483v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14483","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"2PUHZY6NP4F2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"2PUHZY6NP4F22ZWH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"2PUHZY6N","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:f559efbc8a2c44828c6e49931df0b2b10ccdf5d907bef0d87fb1914770e058cf","target":"graph","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"LEMON achieves state-of-the-art performance among the evaluated multi-agent orchestration methods on six reasoning and coding benchmarks including MMLU, GSM8K, AQuA, MultiArith, SVAMP, and HumanEval."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That editing single orchestration fields and measuring the resulting reward contrast supplies reliable, localized credit assignment superior to standard execution-level feedback."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LEMON trains an LLM orchestrator with counterfactual-augmented GRPO to produce deployable multi-agent specifications that reach state-of-the-art results on six reasoning and coding benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Training via localized counterfactual edits allows an LLM to generate executable multi-agent orchestrations that outperform prior methods on reasoning and coding benchmarks."}],"snapshot_sha256":"7b224a068da5e6ea6e0e678419486e10c1c86102ac67b8b536c25faa1d982b31"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Large language models (LLMs) have become a strong foundation for multi-agent systems, but their effectiveness depends heavily on orchestration design. Across different tasks, role design, capacity assignment, and dependency construction jointly affect both solution quality and execution efficiency. Existing approaches automate parts of this design process, yet they often optimize these decisions partially or sequentially, and rely on execution-level feedback that provides limited credit assignment for local orchestration decisions. We propose LEMON (\\textbf{L}earning \\textbf{E}xecutable \\textb","authors_text":"Hua Wei, Kaize Ding, Xudong Chen, Yixin Liu","cross_cats":[],"headline":"Training via localized counterfactual edits allows an LLM to generate executable multi-agent orchestrations that outperform prior methods on reasoning and coding benchmarks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T07:24:09Z","title":"LEMON: Learning Executable Multi-Agent Orchestration via Counterfactual Reinforcement Learning"},"references":{"count":35,"internal_anchors":8,"resolved_work":35,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Autogen: Enabling next-gen llm applications via multi-agent conversations","work_id":"e57ce12a-7d16-4d21-a253-28bdb8094e1a","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Camel: Communicative agents for\" mind\" exploration of large language model society.Advances in neural information processing systems, 36:51991–52008","work_id":"f0e6f682-8c56-41b2-8626-859078781db0","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Metagpt: Meta programming for a multi-agent collaborative framework","work_id":"c406797c-06b5-46e2-b568-86ecd25692f1","year":2023},{"cited_arxiv_id":"2402.01680","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Large Language Model based Multi-Agents: A Survey of Progress and Challenges","work_id":"fb905249-ea5f-4765-80f0-2428ea66f15f","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Improv- ing factuality and reasoning in language models through multiagent debate","work_id":"9479970d-f903-4986-a6bc-d40079ce2a58","year":2024}],"snapshot_sha256":"bf5b04dc9802d00a97d73ccc6bfcd225fc2f58af1f3e979dc485bb98f69f77db"},"source":{"id":"2605.14483","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T02:10:02.859340Z","id":"0fbdaae8-fae4-418d-9900-b76c5d4c3aa9","model_set":{"reader":"grok-4.3"},"one_line_summary":"LEMON trains an LLM orchestrator with counterfactual-augmented GRPO to produce deployable multi-agent specifications that reach state-of-the-art results on six reasoning and coding benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Training via localized counterfactual edits allows an LLM to generate executable multi-agent orchestrations that outperform prior methods on reasoning and coding benchmarks.","strongest_claim":"LEMON achieves state-of-the-art performance among the evaluated multi-agent orchestration methods on six reasoning and coding benchmarks including MMLU, GSM8K, AQuA, MultiArith, SVAMP, and HumanEval.","weakest_assumption":"That editing single orchestration fields and measuring the resulting reward contrast supplies reliable, localized credit assignment superior to standard execution-level feedback."}},"verdict_id":"0fbdaae8-fae4-418d-9900-b76c5d4c3aa9"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ebd61578028f500248a5aa6260c08b07cadd3d87ef69bc42eca26fd486fffe1f","target":"record","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"236757d0ccb35a66f11aa890ef56facf0f58ebbd3968e1fd69fee72bd6ec8a80","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T07:24:09Z","title_canon_sha256":"4e19252c61acca31c7e638280dea611978d7de2676970ccf31a539c3e48334e0"},"schema_version":"1.0","source":{"id":"2605.14483","kind":"arxiv","version":1}},"canonical_sha256":"d3e87ce3cd7f0bad66c79bf6fe65fe8e3a22fa6f9717bbd618046fcf5f8ff633","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d3e87ce3cd7f0bad66c79bf6fe65fe8e3a22fa6f9717bbd618046fcf5f8ff633","first_computed_at":"2026-05-17T23:39:06.525807Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:06.525807Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"rIhyDe+aY5dNHgy3H7qQzsIa0jywdniO2c2i+fMClEwFUsUYspCqU8iQfBqnnvtEKtQ9I5OQSR9rmCWWJvueDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:06.526537Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14483","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ebd61578028f500248a5aa6260c08b07cadd3d87ef69bc42eca26fd486fffe1f","sha256:f559efbc8a2c44828c6e49931df0b2b10ccdf5d907bef0d87fb1914770e058cf"],"state_sha256":"6eb391ce314b65a3f53aa19f528eda33151d605900c755ef0e1e074b4f99d525"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"E7QMu5AWwzqEGRRUH9l07jGGN6p/AUnzf4qGE5uhRfrgQmXCaJ9r6eLg1lv1VaSPvVr53L7sEDWGqQVuraeZAw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T02:47:58.939659Z","bundle_sha256":"adb9fa60f645c1025313d94b61237b071a82e29ecac4a8cf44779282a1331fac"}}