{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:OLAS332TXBRXFCGYRSHC45VXEQ","short_pith_number":"pith:OLAS332T","canonical_record":{"source":{"id":"2510.02837","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-10-03T09:19:15Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"2e25b739daeae5867ec6f808ee681bf94f7d47427bd972ace1eddc3351f203b8","abstract_canon_sha256":"1d9f4e9577adb4af5503e56f5a4279ebc07860a137d2d96ecaeed74cabfc4b01"},"schema_version":"1.0"},"canonical_sha256":"72c12def53b8637288d88c8e2e76b72418e936b25075068f43b29046a69a7032","source":{"kind":"arxiv","id":"2510.02837","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.02837","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"arxiv_version","alias_value":"2510.02837v2","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.02837","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"pith_short_12","alias_value":"OLAS332TXBRX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"OLAS332TXBRXFCGY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"OLAS332T","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:OLAS332TXBRXFCGYRSHC45VXEQ","target":"record","payload":{"canonical_record":{"source":{"id":"2510.02837","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-10-03T09:19:15Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"2e25b739daeae5867ec6f808ee681bf94f7d47427bd972ace1eddc3351f203b8","abstract_canon_sha256":"1d9f4e9577adb4af5503e56f5a4279ebc07860a137d2d96ecaeed74cabfc4b01"},"schema_version":"1.0"},"canonical_sha256":"72c12def53b8637288d88c8e2e76b72418e936b25075068f43b29046a69a7032","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:04.728434Z","signature_b64":"F8+cPHh8R4X9u5UoIuLNVZmf5qY1oH8Gq0uNg4WSo7685iWEPqe3xQsiPq0dhQCyJUBgPVBLhb0YYwEpa3ecBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"72c12def53b8637288d88c8e2e76b72418e936b25075068f43b29046a69a7032","last_reissued_at":"2026-05-17T23:39:04.727867Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:04.727867Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2510.02837","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SJ/Gj8V7/WOGcP7+JP9rvECzk2ZgreBbK6KUCa6wUly8SpIc0346A99CrAPZlI7WApZFjk4l4ntB5gTtngpRDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T04:35:49.454378Z"},"content_sha256":"e2e47bf6ddbfd4bd49d5ebe81946bcbf1592bf05b181ad860f6573c98af547c0","schema_version":"1.0","event_id":"sha256:e2e47bf6ddbfd4bd49d5ebe81946bcbf1592bf05b181ad860f6573c98af547c0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:OLAS332TXBRXFCGYRSHC45VXEQ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Beyond the Final Answer: Evaluating the Reasoning Trajectories of Tool-Augmented Agents","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.AI","authors_text":"Chanyoung Park, Dongha Lee, Sangwu Park, Sein Kim, Wonjoong Kim, Yeonjun In","submitted_at":"2025-10-03T09:19:15Z","abstract_excerpt":"Although recent tool-augmented benchmarks involve complex requests, evaluation remains limited to answer matching, neglecting critical trajectory aspects like efficiency, hallucination, and adaptivity. The most straightforward method for evaluation is to compare an agent's trajectory with the ground-truth, but annotating all valid ground-truth trajectories is prohibitively expensive. In this manner, we introduce TRACE, a reference-free framework for the multi-dimensional evaluation of tool-augmented LLMs. By incorporating an evidence bank which accumulates knowledge from preceding steps, TRACE"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.02837","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:04Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"4CePM+Fp/ZBqhkCYfDbNvhIUynVeDvwPAZcRHwLBaQVERlaJ4AqEPMv6Myg5mesQZZR7N4COm01AkUGGVVM+AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-20T04:35:49.455187Z"},"content_sha256":"163f8b5f2269b953771a0d818fa5f410747033523b98d633cbf0493fd5f34446","schema_version":"1.0","event_id":"sha256:163f8b5f2269b953771a0d818fa5f410747033523b98d633cbf0493fd5f34446"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/OLAS332TXBRXFCGYRSHC45VXEQ/bundle.json","state_url":"https://pith.science/pith/OLAS332TXBRXFCGYRSHC45VXEQ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/OLAS332TXBRXFCGYRSHC45VXEQ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-20T04:35:49Z","links":{"resolver":"https://pith.science/pith/OLAS332TXBRXFCGYRSHC45VXEQ","bundle":"https://pith.science/pith/OLAS332TXBRXFCGYRSHC45VXEQ/bundle.json","state":"https://pith.science/pith/OLAS332TXBRXFCGYRSHC45VXEQ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/OLAS332TXBRXFCGYRSHC45VXEQ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:OLAS332TXBRXFCGYRSHC45VXEQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"1d9f4e9577adb4af5503e56f5a4279ebc07860a137d2d96ecaeed74cabfc4b01","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-10-03T09:19:15Z","title_canon_sha256":"2e25b739daeae5867ec6f808ee681bf94f7d47427bd972ace1eddc3351f203b8"},"schema_version":"1.0","source":{"id":"2510.02837","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.02837","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"arxiv_version","alias_value":"2510.02837v2","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.02837","created_at":"2026-05-17T23:39:04Z"},{"alias_kind":"pith_short_12","alias_value":"OLAS332TXBRX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"OLAS332TXBRXFCGY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"OLAS332T","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:163f8b5f2269b953771a0d818fa5f410747033523b98d633cbf0493fd5f34446","target":"graph","created_at":"2026-05-17T23:39:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Although recent tool-augmented benchmarks involve complex requests, evaluation remains limited to answer matching, neglecting critical trajectory aspects like efficiency, hallucination, and adaptivity. The most straightforward method for evaluation is to compare an agent's trajectory with the ground-truth, but annotating all valid ground-truth trajectories is prohibitively expensive. In this manner, we introduce TRACE, a reference-free framework for the multi-dimensional evaluation of tool-augmented LLMs. By incorporating an evidence bank which accumulates knowledge from preceding steps, TRACE","authors_text":"Chanyoung Park, Dongha Lee, Sangwu Park, Sein Kim, Wonjoong Kim, Yeonjun In","cross_cats":["cs.CL"],"headline":"","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-10-03T09:19:15Z","title":"Beyond the Final Answer: Evaluating the Reasoning Trajectories of Tool-Augmented Agents"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.02837","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e2e47bf6ddbfd4bd49d5ebe81946bcbf1592bf05b181ad860f6573c98af547c0","target":"record","created_at":"2026-05-17T23:39:04Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"1d9f4e9577adb4af5503e56f5a4279ebc07860a137d2d96ecaeed74cabfc4b01","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-10-03T09:19:15Z","title_canon_sha256":"2e25b739daeae5867ec6f808ee681bf94f7d47427bd972ace1eddc3351f203b8"},"schema_version":"1.0","source":{"id":"2510.02837","kind":"arxiv","version":2}},"canonical_sha256":"72c12def53b8637288d88c8e2e76b72418e936b25075068f43b29046a69a7032","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"72c12def53b8637288d88c8e2e76b72418e936b25075068f43b29046a69a7032","first_computed_at":"2026-05-17T23:39:04.727867Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:04.727867Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"F8+cPHh8R4X9u5UoIuLNVZmf5qY1oH8Gq0uNg4WSo7685iWEPqe3xQsiPq0dhQCyJUBgPVBLhb0YYwEpa3ecBQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:04.728434Z","signed_message":"canonical_sha256_bytes"},"source_id":"2510.02837","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:e2e47bf6ddbfd4bd49d5ebe81946bcbf1592bf05b181ad860f6573c98af547c0","sha256:163f8b5f2269b953771a0d818fa5f410747033523b98d633cbf0493fd5f34446"],"state_sha256":"e0c0496d90a8e5a8c69ca0877ff5c3ffc246366e162d20632ccd43fe4622b033"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"y/SF/GUvjHUJC/WHa+jQaRSnumh/8prVaeH4JmIG0ARc6bd4RkH2Iph2dgqJO7CaPzm9opTs+RkCMsDRTd90DA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-20T04:35:49.459241Z","bundle_sha256":"9a8e91600b55d8e2569083664a9a943964b2b5f4d68e7bd3fc4dd3232ee917eb"}}