{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:OLP63MQOF4LHYFUEL6YGVXI3JS","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"006016a9eced51157013869d7de6a7d2da1b2fa238e23ef597a47208063461c4","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.MA","submitted_at":"2026-05-15T07:41:41Z","title_canon_sha256":"7af84afdb813ba905d2eb1e6e8a49edbc5645b0cbfd8d8ac577d572a77ecc8dd"},"schema_version":"1.0","source":{"id":"2605.15697","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15697","created_at":"2026-05-20T00:01:13Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15697v1","created_at":"2026-05-20T00:01:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15697","created_at":"2026-05-20T00:01:13Z"},{"alias_kind":"pith_short_12","alias_value":"OLP63MQOF4LH","created_at":"2026-05-20T00:01:13Z"},{"alias_kind":"pith_short_16","alias_value":"OLP63MQOF4LHYFUE","created_at":"2026-05-20T00:01:13Z"},{"alias_kind":"pith_short_8","alias_value":"OLP63MQO","created_at":"2026-05-20T00:01:13Z"}],"graph_snapshots":[{"event_id":"sha256:000d6fdffccd3eb3f2ce59049f7d028c07e8b12bf8b81daabd5c66c6521063de","target":"graph","created_at":"2026-05-20T00:01:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We further rigorously establish that the proposed algorithm converges to an ε-stationary point with polynomial sample complexity."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The human preference feedback generated from spatiotemporally truncated trajectories (H-horizon pairs aggregated over each agent's κ-hop neighborhood) depends solely on local state-action information and can be used to produce unbiased estimates of each agent's local policy gradient without requiring global state or explicit rewards."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A distributed zeroth-order policy gradient algorithm allows networked agents to collaboratively optimize policies using only local human preference feedback on H-horizon trajectory pairs from kappa-hop neighborhoods, with proven convergence to an epsilon-stationary point."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Agents in a network can learn collaborative policies from local human feedback on short trajectory pairs without needing global states or reward signals."}],"snapshot_sha256":"143e08fcc34bf9025b6b75aefc45550e75a917d97381025d0fb502ced044ff68"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"69548307985809d66962e90c8d9957ab4da6acad5d646705e840c2aff1b261ad"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T19:33:28.765573Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T19:31:30.107941Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T19:31:19.127308Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-19T17:21:56.037442Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.15697/integrity.json","findings":[],"snapshot_sha256":"eb1018ee5a8f8df3be0a8d5f594f05dfd39b07db286f0e3d6ae0b72a680b7180","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"We study a networked multi-agent reinforcement learning (NMARL) problem with human feedback in an infinite-horizon setting, where agents interact over an underlying network with localized state dependencies and aim to collaboratively maximize the average discounted return. Existing approaches with preference feedback are primarily developed for single-agent settings and rely on centralized training, which limits their scalability and applicability to large-scale networked multi-agent systems. To address this, we introduce a novel human feedback mechanism based on spatiotemporally truncated tra","authors_text":"Dongming Wang, He Wang, Jian Qin, Pengcheng Dai, Wenwu Yu","cross_cats":[],"headline":"Agents in a network can learn collaborative policies from local human feedback on short trajectory pairs without needing global states or reward signals.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.MA","submitted_at":"2026-05-15T07:41:41Z","title":"Distributed Zeroth-Order Policy Gradient for Networked Multi-agent Reinforcement Learning from Human Feedback"},"references":{"count":36,"internal_anchors":1,"resolved_work":36,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Dai, P., Yu, W., Wen, G., & Baldi, S. (2020). Distributed reinforcement learning algorithm for dynamic economic dispatch with unknown generation cost functions. IEEE Transactions on Industrial Informa","work_id":"53a31e8b-026f-4a64-96e6-fa2dde8095c6","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Li, F., Qin, J., & Zheng, W. (2020). Distributed Q -learning-based online optimization algorithm for unit commitment and dispatch in smart grid. IEEE Transactions on Cybernetics, 50(9), 4146-4156","work_id":"a2dc5302-a6ff-4954-af46-3a9d7fbf396b","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Dai, P., Yu, W., & Chen, D. (2022). Distributed Q-learning algorithm for dynamic resource allocation with unknown objective functions and application to microgrid. IEEE Transactions on Cybernetics, 52","work_id":"a401ed89-f97e-4131-8ca3-cdc1e5dccb8a","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Chu, T., Wang, J., Codec\\` a , L., & Li, Z. (2020). Multi-agent deep reinforcement learning for large-scale traffic signal control. IEEE Transactions on Intelligent Transportation Systems, 21(3), 1086","work_id":"3916716f-1b58-4cf6-8100-34005ca94e46","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Wang, X., Ke, L., Qiao, Z., & Chai, X. (2021). Large-scale traffic signal control using a novel multiagent reinforcement learning. IEEE Transactions on Cybernetics, 51(1), 174-187","work_id":"ce4e9080-4831-49af-b56a-adac685f598d","year":2021}],"snapshot_sha256":"e48fe26c63420b8437b0b35622cce4e60e274e1db2f28474067ab0942c1f0659"},"source":{"id":"2605.15697","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-19T19:18:05.623397Z","id":"63474267-b586-40fe-8e45-922d1926a0bc","model_set":{"reader":"grok-4.3"},"one_line_summary":"A distributed zeroth-order policy gradient algorithm allows networked agents to collaboratively optimize policies using only local human preference feedback on H-horizon trajectory pairs from kappa-hop neighborhoods, with proven convergence to an epsilon-stationary point.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Agents in a network can learn collaborative policies from local human feedback on short trajectory pairs without needing global states or reward signals.","strongest_claim":"We further rigorously establish that the proposed algorithm converges to an ε-stationary point with polynomial sample complexity.","weakest_assumption":"The human preference feedback generated from spatiotemporally truncated trajectories (H-horizon pairs aggregated over each agent's κ-hop neighborhood) depends solely on local state-action information and can be used to produce unbiased estimates of each agent's local policy gradient without requiring global state or explicit rewards."}},"verdict_id":"63474267-b586-40fe-8e45-922d1926a0bc"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:ed625b064a55abe0c3a46514659417512851df7e85cfd73388e656388ed224af","target":"record","created_at":"2026-05-20T00:01:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"006016a9eced51157013869d7de6a7d2da1b2fa238e23ef597a47208063461c4","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.MA","submitted_at":"2026-05-15T07:41:41Z","title_canon_sha256":"7af84afdb813ba905d2eb1e6e8a49edbc5645b0cbfd8d8ac577d572a77ecc8dd"},"schema_version":"1.0","source":{"id":"2605.15697","kind":"arxiv","version":1}},"canonical_sha256":"72dfedb20e2f167c16845fb06add1b4c9c911b84523986ca347a5614f516c260","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"72dfedb20e2f167c16845fb06add1b4c9c911b84523986ca347a5614f516c260","first_computed_at":"2026-05-20T00:01:13.055965Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:01:13.055965Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"2by972SWoFZG9M3sHmJwVuGAFHN+3ReSQCoLgJogesi/iCh3dJuikJthd5lOSH9EaoN5SS0xKn/BBbWmHM5NBQ==","signature_status":"signed_v1","signed_at":"2026-05-20T00:01:13.057843Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.15697","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:ed625b064a55abe0c3a46514659417512851df7e85cfd73388e656388ed224af","sha256:000d6fdffccd3eb3f2ce59049f7d028c07e8b12bf8b81daabd5c66c6521063de"],"state_sha256":"bfe2dcea35e3bf30aa1f1bf19ed4a3be5b2fcd6508d2bfc448b95a0ab1861037"}