{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:ZYBOZ7ADA2AP6FTVJDMND5NFE7","short_pith_number":"pith:ZYBOZ7AD","canonical_record":{"source":{"id":"2605.14477","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T07:18:12Z","cross_cats_sorted":[],"title_canon_sha256":"c3cbb1f626339a3780999d67594f72d96efa060e1da9d5f0f43ec8ac47f0aba2","abstract_canon_sha256":"41bb27f962974400196d990a6a47b421f1e629498ca694ae56ecd50b45d58c8a"},"schema_version":"1.0"},"canonical_sha256":"ce02ecfc030680ff167548d8d1f5a527d8c51f850c3d2a2d3540572e1bc4288d","source":{"kind":"arxiv","id":"2605.14477","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14477","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14477v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14477","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"ZYBOZ7ADA2AP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"ZYBOZ7ADA2AP6FTV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"ZYBOZ7AD","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:ZYBOZ7ADA2AP6FTVJDMND5NFE7","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14477","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T07:18:12Z","cross_cats_sorted":[],"title_canon_sha256":"c3cbb1f626339a3780999d67594f72d96efa060e1da9d5f0f43ec8ac47f0aba2","abstract_canon_sha256":"41bb27f962974400196d990a6a47b421f1e629498ca694ae56ecd50b45d58c8a"},"schema_version":"1.0"},"canonical_sha256":"ce02ecfc030680ff167548d8d1f5a527d8c51f850c3d2a2d3540572e1bc4288d","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:06.589696Z","signature_b64":"ivN+4vKLZr7uh+Lv9h3/ZHAg9V/TTSlkNaW+nf2MqExy2D0lSjzoR8C9NAPklbmu8S3WPS1UJrTilXu/PJ9GCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ce02ecfc030680ff167548d8d1f5a527d8c51f850c3d2a2d3540572e1bc4288d","last_reissued_at":"2026-05-17T23:39:06.589040Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:06.589040Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14477","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"jEV7tAApnjZ+WzQhQgUybGuI83P5yNaqsddHSMJORJs9hEaeaeuW4UqehT+gBGwTlVZFiqhZIheJiRJG0JmZAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T19:15:28.876278Z"},"content_sha256":"5e80887a61c348dd57318685a45851ef3c1fb7af676062998807e46fce719353","schema_version":"1.0","event_id":"sha256:5e80887a61c348dd57318685a45851ef3c1fb7af676062998807e46fce719353"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:ZYBOZ7ADA2AP6FTVJDMND5NFE7","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Test-Time Learning with an Evolving Library","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Large language models improve on complex reasoning by building and evolving a shared library of skills extracted from their own inference trajectories without any parameter updates or external supervision.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Alessandro Sordoni, Chandan Singh, Jianfeng Gao, Michel Galley, Weijia Xu, Xingdi Yuan, Zelalem Gero","submitted_at":"2026-05-14T07:18:12Z","abstract_excerpt":"We introduce EvoLib, a test-time learning framework that enables large language models to accumulate, reuse, and evolve knowledge across problem instances without parameter updates or external supervision. Instead of adapting model parameters, our approach maintains a shared library of knowledge abstractions, including modular skills and reflective insights, automatically extracted from the model's own inference trajectories. To support continual improvement, we introduce a principled weighting and consolidation mechanism that jointly optimizes for immediate utility and long-term value. This a"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Across challenging benchmarks in mathematical reasoning, code generation, and multi-turn agentic environments, EvoLib improves substantially over the top test-time scaling and learning methods without ground-truth feedback.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That modular skills and reflective insights automatically extracted from the model's own inference trajectories can be weighted and consolidated into increasingly general and reusable abstractions that deliver long-term value without any external supervision or ground-truth signals.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"EvoLib enables LLMs to accumulate, reuse, and evolve knowledge abstractions from inference trajectories at test time, yielding substantial gains on math reasoning, code generation, and agentic benchmarks without parameter updates or supervision.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Large language models improve on complex reasoning by building and evolving a shared library of skills extracted from their own inference trajectories without any parameter updates or external supervision.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a33bef23ae6ae46d68a7a5d9e1d9056530e90833779f016152a78e7134648fb9"},"source":{"id":"2605.14477","kind":"arxiv","version":1},"verdict":{"id":"76db1182-5dea-471e-806f-2195e6f1bd54","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T01:39:42.903593Z","strongest_claim":"Across challenging benchmarks in mathematical reasoning, code generation, and multi-turn agentic environments, EvoLib improves substantially over the top test-time scaling and learning methods without ground-truth feedback.","one_line_summary":"EvoLib enables LLMs to accumulate, reuse, and evolve knowledge abstractions from inference trajectories at test time, yielding substantial gains on math reasoning, code generation, and agentic benchmarks without parameter updates or supervision.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That modular skills and reflective insights automatically extracted from the model's own inference trajectories can be weighted and consolidated into increasingly general and reusable abstractions that deliver long-term value without any external supervision or ground-truth signals.","pith_extraction_headline":"Large language models improve on complex reasoning by building and evolving a shared library of skills extracted from their own inference trajectories without any parameter updates or external supervision."},"references":{"count":45,"sample":[{"doi":"","year":2023,"title":"Chi, Sharan Narang, Aakanksha Chowdhery, and Denny Zhou","work_id":"5d4e8522-a5a1-4949-89a6-d2d315b3400d","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Large language models are better reasoners with self-verification","work_id":"47c4f4f1-5fef-4ea3-9420-d7c113b66ada","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters","work_id":"a8d50b24-bdf5-46ed-bc4f-2927dfd81f1d","ref_index":3,"cited_arxiv_id":"2408.03314","is_internal_anchor":true},{"doi":"","year":2026,"title":"Test-time recursive thinking: Self-improvement without external feedback","work_id":"e6ec4065-502a-4845-8a5f-c68221d08a55","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"s1: Simple test-time scaling","work_id":"63c57ea7-aec3-4070-9865-c9f2b796d7ef","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":45,"snapshot_sha256":"8548676349b0e16c160ef84cef93eadee2307c03a186c73f94f929cc010586d5","internal_anchors":12},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"76db1182-5dea-471e-806f-2195e6f1bd54"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:06Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YlbyGChURadmhI0+Mz+ZKyE4fDgCYBX/JnVWQdlrJ6ezoBIC1g6v/j8OqAyPI2vRKyTyIKuPqjsduCzS+PFbDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T19:15:28.877110Z"},"content_sha256":"92a018e1e0feeed6bdab9b16aae8ea3735deb7ff280f9fe4c8c4c3192c494a09","schema_version":"1.0","event_id":"sha256:92a018e1e0feeed6bdab9b16aae8ea3735deb7ff280f9fe4c8c4c3192c494a09"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/ZYBOZ7ADA2AP6FTVJDMND5NFE7/bundle.json","state_url":"https://pith.science/pith/ZYBOZ7ADA2AP6FTVJDMND5NFE7/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/ZYBOZ7ADA2AP6FTVJDMND5NFE7/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T19:15:28Z","links":{"resolver":"https://pith.science/pith/ZYBOZ7ADA2AP6FTVJDMND5NFE7","bundle":"https://pith.science/pith/ZYBOZ7ADA2AP6FTVJDMND5NFE7/bundle.json","state":"https://pith.science/pith/ZYBOZ7ADA2AP6FTVJDMND5NFE7/state.json","well_known_bundle":"https://pith.science/.well-known/pith/ZYBOZ7ADA2AP6FTVJDMND5NFE7/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:ZYBOZ7ADA2AP6FTVJDMND5NFE7","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"41bb27f962974400196d990a6a47b421f1e629498ca694ae56ecd50b45d58c8a","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T07:18:12Z","title_canon_sha256":"c3cbb1f626339a3780999d67594f72d96efa060e1da9d5f0f43ec8ac47f0aba2"},"schema_version":"1.0","source":{"id":"2605.14477","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14477","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14477v1","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14477","created_at":"2026-05-17T23:39:06Z"},{"alias_kind":"pith_short_12","alias_value":"ZYBOZ7ADA2AP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"ZYBOZ7ADA2AP6FTV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"ZYBOZ7AD","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:92a018e1e0feeed6bdab9b16aae8ea3735deb7ff280f9fe4c8c4c3192c494a09","target":"graph","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across challenging benchmarks in mathematical reasoning, code generation, and multi-turn agentic environments, EvoLib improves substantially over the top test-time scaling and learning methods without ground-truth feedback."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That modular skills and reflective insights automatically extracted from the model's own inference trajectories can be weighted and consolidated into increasingly general and reusable abstractions that deliver long-term value without any external supervision or ground-truth signals."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"EvoLib enables LLMs to accumulate, reuse, and evolve knowledge abstractions from inference trajectories at test time, yielding substantial gains on math reasoning, code generation, and agentic benchmarks without parameter updates or supervision."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Large language models improve on complex reasoning by building and evolving a shared library of skills extracted from their own inference trajectories without any parameter updates or external supervision."}],"snapshot_sha256":"a33bef23ae6ae46d68a7a5d9e1d9056530e90833779f016152a78e7134648fb9"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"We introduce EvoLib, a test-time learning framework that enables large language models to accumulate, reuse, and evolve knowledge across problem instances without parameter updates or external supervision. Instead of adapting model parameters, our approach maintains a shared library of knowledge abstractions, including modular skills and reflective insights, automatically extracted from the model's own inference trajectories. To support continual improvement, we introduce a principled weighting and consolidation mechanism that jointly optimizes for immediate utility and long-term value. This a","authors_text":"Alessandro Sordoni, Chandan Singh, Jianfeng Gao, Michel Galley, Weijia Xu, Xingdi Yuan, Zelalem Gero","cross_cats":[],"headline":"Large language models improve on complex reasoning by building and evolving a shared library of skills extracted from their own inference trajectories without any parameter updates or external supervision.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T07:18:12Z","title":"Test-Time Learning with an Evolving Library"},"references":{"count":45,"internal_anchors":12,"resolved_work":45,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Chi, Sharan Narang, Aakanksha Chowdhery, and Denny Zhou","work_id":"5d4e8522-a5a1-4949-89a6-d2d315b3400d","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Large language models are better reasoners with self-verification","work_id":"47c4f4f1-5fef-4ea3-9420-d7c113b66ada","year":2023},{"cited_arxiv_id":"2408.03314","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters","work_id":"a8d50b24-bdf5-46ed-bc4f-2927dfd81f1d","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Test-time recursive thinking: Self-improvement without external feedback","work_id":"e6ec4065-502a-4845-8a5f-c68221d08a55","year":2026},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"s1: Simple test-time scaling","work_id":"63c57ea7-aec3-4070-9865-c9f2b796d7ef","year":2025}],"snapshot_sha256":"8548676349b0e16c160ef84cef93eadee2307c03a186c73f94f929cc010586d5"},"source":{"id":"2605.14477","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T01:39:42.903593Z","id":"76db1182-5dea-471e-806f-2195e6f1bd54","model_set":{"reader":"grok-4.3"},"one_line_summary":"EvoLib enables LLMs to accumulate, reuse, and evolve knowledge abstractions from inference trajectories at test time, yielding substantial gains on math reasoning, code generation, and agentic benchmarks without parameter updates or supervision.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Large language models improve on complex reasoning by building and evolving a shared library of skills extracted from their own inference trajectories without any parameter updates or external supervision.","strongest_claim":"Across challenging benchmarks in mathematical reasoning, code generation, and multi-turn agentic environments, EvoLib improves substantially over the top test-time scaling and learning methods without ground-truth feedback.","weakest_assumption":"That modular skills and reflective insights automatically extracted from the model's own inference trajectories can be weighted and consolidated into increasingly general and reusable abstractions that deliver long-term value without any external supervision or ground-truth signals."}},"verdict_id":"76db1182-5dea-471e-806f-2195e6f1bd54"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:5e80887a61c348dd57318685a45851ef3c1fb7af676062998807e46fce719353","target":"record","created_at":"2026-05-17T23:39:06Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"41bb27f962974400196d990a6a47b421f1e629498ca694ae56ecd50b45d58c8a","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T07:18:12Z","title_canon_sha256":"c3cbb1f626339a3780999d67594f72d96efa060e1da9d5f0f43ec8ac47f0aba2"},"schema_version":"1.0","source":{"id":"2605.14477","kind":"arxiv","version":1}},"canonical_sha256":"ce02ecfc030680ff167548d8d1f5a527d8c51f850c3d2a2d3540572e1bc4288d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"ce02ecfc030680ff167548d8d1f5a527d8c51f850c3d2a2d3540572e1bc4288d","first_computed_at":"2026-05-17T23:39:06.589040Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:06.589040Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ivN+4vKLZr7uh+Lv9h3/ZHAg9V/TTSlkNaW+nf2MqExy2D0lSjzoR8C9NAPklbmu8S3WPS1UJrTilXu/PJ9GCw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:06.589696Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14477","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:5e80887a61c348dd57318685a45851ef3c1fb7af676062998807e46fce719353","sha256:92a018e1e0feeed6bdab9b16aae8ea3735deb7ff280f9fe4c8c4c3192c494a09"],"state_sha256":"da717969e3ab8410d7e8467ad8b7ece6060b257059769592e20f686975fbd78c"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VANHGrWb7F0DvIS3a2xekQjhj1+tBO9vuAuShmg7Aut8aiIWlsiOz8NYMPpKHtaiXqqkjmweLrZ/wQijZVv+CA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T19:15:28.880861Z","bundle_sha256":"ff1f0842ce4697c4467c5e8221ebff21f994572f76a12d0a5ecbb65d17606cce"}}