{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:RZQPDAUOJVPQQRMAN53XM22F27","short_pith_number":"pith:RZQPDAUO","schema_version":"1.0","canonical_sha256":"8e60f1828e4d5f0845806f77766b45d7f19db58bd0ca245800beb9d4a0b2c81d","source":{"kind":"arxiv","id":"2511.09378","version":2},"attestation_state":"computed","paper":{"title":"Frontier Large Language Models Rival State-of-the-Art Planners","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Andr\\'e G. Pereira, Augusto B. Corr\\^ea, Jendrik Seipp","submitted_at":"2025-11-12T14:45:07Z","abstract_excerpt":"A series of influential studies established that large language models cannot reliably solve even simple planning tasks. We show that the latest generation of frontier models overturns this conclusion. We evaluate three families of frontier LLMs on a challenging set of planning tasks based on the most recent International Planning Competition following rigorous evaluation guidelines: solutions are verified with a validation tool, tasks are freshly created to avoid data contamination, and performance is compared against state-of-the-art classical planners. On standard task descriptions, Gemini "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2511.09378","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2025-11-12T14:45:07Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"a7200e26b74b0a78ba37006a78fdfb908ff03aa798e49f1fc8adb0fe97e23d5b","abstract_canon_sha256":"02901472aa99282c0859c80f49065fa68a98d19b87358566c678b906050b534c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:01:36.982204Z","signature_b64":"VylNlqySYNKqEzxqZ5FpUVieZNzmEkMhFoK43jMHjcXTbT1teuSCNGjt4FPRhZpQ1CTUJHjlgp5ktkc1wQquDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8e60f1828e4d5f0845806f77766b45d7f19db58bd0ca245800beb9d4a0b2c81d","last_reissued_at":"2026-05-20T00:01:36.981553Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:01:36.981553Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Frontier Large Language Models Rival State-of-the-Art Planners","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"cs.AI","authors_text":"Andr\\'e G. Pereira, Augusto B. Corr\\^ea, Jendrik Seipp","submitted_at":"2025-11-12T14:45:07Z","abstract_excerpt":"A series of influential studies established that large language models cannot reliably solve even simple planning tasks. We show that the latest generation of frontier models overturns this conclusion. We evaluate three families of frontier LLMs on a challenging set of planning tasks based on the most recent International Planning Competition following rigorous evaluation guidelines: solutions are verified with a validation tool, tasks are freshly created to avoid data contamination, and performance is compared against state-of-the-art classical planners. On standard task descriptions, Gemini "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2511.09378","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2511.09378/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2511.09378","created_at":"2026-05-20T00:01:36.981638+00:00"},{"alias_kind":"arxiv_version","alias_value":"2511.09378v2","created_at":"2026-05-20T00:01:36.981638+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.09378","created_at":"2026-05-20T00:01:36.981638+00:00"},{"alias_kind":"pith_short_12","alias_value":"RZQPDAUOJVPQ","created_at":"2026-05-20T00:01:36.981638+00:00"},{"alias_kind":"pith_short_16","alias_value":"RZQPDAUOJVPQQRMA","created_at":"2026-05-20T00:01:36.981638+00:00"},{"alias_kind":"pith_short_8","alias_value":"RZQPDAUO","created_at":"2026-05-20T00:01:36.981638+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2605.16142","citing_title":"Property-Guided LLM Program Synthesis for Planning","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15333","citing_title":"Zero-Shot Goal Recognition with Large Language Models","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2603.02070","citing_title":"Exploring Plan Space through Conversation: An Agentic Framework for LLM-Mediated Explanations in Planning","ref_index":8,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27","json":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27.json","graph_json":"https://pith.science/api/pith-number/RZQPDAUOJVPQQRMAN53XM22F27/graph.json","events_json":"https://pith.science/api/pith-number/RZQPDAUOJVPQQRMAN53XM22F27/events.json","paper":"https://pith.science/paper/RZQPDAUO"},"agent_actions":{"view_html":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27","download_json":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27.json","view_paper":"https://pith.science/paper/RZQPDAUO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2511.09378&json=true","fetch_graph":"https://pith.science/api/pith-number/RZQPDAUOJVPQQRMAN53XM22F27/graph.json","fetch_events":"https://pith.science/api/pith-number/RZQPDAUOJVPQQRMAN53XM22F27/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27/action/storage_attestation","attest_author":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27/action/author_attestation","sign_citation":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27/action/citation_signature","submit_replication":"https://pith.science/pith/RZQPDAUOJVPQQRMAN53XM22F27/action/replication_record"}},"created_at":"2026-05-20T00:01:36.981638+00:00","updated_at":"2026-05-20T00:01:36.981638+00:00"}