{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:SQGZQU75X55JQ6S365V6ZNFFRM","short_pith_number":"pith:SQGZQU75","canonical_record":{"source":{"id":"2605.15333","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T18:56:06Z","cross_cats_sorted":[],"title_canon_sha256":"0597d612385480c03d08975004f6e3a92d19fcdb4f7292a1df15833cff627cba","abstract_canon_sha256":"d662ad636d5ee04fb171da90b0f4f41463ca9c2c8e4a3e6e7f2e8a06c970b979"},"schema_version":"1.0"},"canonical_sha256":"940d9853fdbf7a987a5bf76becb4a58b386eb15069f5e22786c38aa1a7f42569","source":{"kind":"arxiv","id":"2605.15333","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15333","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15333v1","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15333","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"pith_short_12","alias_value":"SQGZQU75X55J","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"pith_short_16","alias_value":"SQGZQU75X55JQ6S3","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"pith_short_8","alias_value":"SQGZQU75","created_at":"2026-05-20T00:00:53Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:SQGZQU75X55JQ6S365V6ZNFFRM","target":"record","payload":{"canonical_record":{"source":{"id":"2605.15333","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T18:56:06Z","cross_cats_sorted":[],"title_canon_sha256":"0597d612385480c03d08975004f6e3a92d19fcdb4f7292a1df15833cff627cba","abstract_canon_sha256":"d662ad636d5ee04fb171da90b0f4f41463ca9c2c8e4a3e6e7f2e8a06c970b979"},"schema_version":"1.0"},"canonical_sha256":"940d9853fdbf7a987a5bf76becb4a58b386eb15069f5e22786c38aa1a7f42569","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:53.022008Z","signature_b64":"+OaX44UrftD/oUv8vPLnidvLk2EmFJ2o2ZvueD/X8Prqt+Jy9c4Nf7KTvqX42nRNvCB2n8Pp52V99YeRHX/mAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"940d9853fdbf7a987a5bf76becb4a58b386eb15069f5e22786c38aa1a7f42569","last_reissued_at":"2026-05-20T00:00:53.021274Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:53.021274Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.15333","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:00:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"urpfC2aor1aqHXpzNWZnruMJMaWIiZHzGdBQCMw/mQoh5WQZFl05FUmzol25aPjvKnsIT7YS+Mc5Lq+kDFqZAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T15:38:27.179922Z"},"content_sha256":"097c0b51e0832a3bff76faf0fb99bae453f58e85983bb89e0061cbee338f550c","schema_version":"1.0","event_id":"sha256:097c0b51e0832a3bff76faf0fb99bae453f58e85983bb89e0061cbee338f550c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:SQGZQU75X55JQ6S365V6ZNFFRM","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Zero-Shot Goal Recognition with Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Large language models show uneven competence at recognizing goals from action observations, with some improving as evidence grows and others staying fixed to prior knowledge.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Felipe Meneguzzi, Kin Max Piamolini Gusm\\~ao, Nathan Gavenski, Nir Oren","submitted_at":"2026-05-14T18:56:06Z","abstract_excerpt":"Large language models have recently reached near-parity with classical planners on well-known planning domains, yet this competence relies on world-knowledge exploitation rather than genuine symbolic reasoning. Goal recognition is a complementary abductive task structurally better suited to LLM strengths: it consists of evaluating consistency with world knowledge rather than generating novel action sequences. This paper provides the first systematic zero-shot evaluation of frontier LLMs as goal recognisers on key classical PDDL benchmarks. Our results show that LLM competence on goal recogniti"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our results show that LLM competence on goal recognition is uneven: some models scale with evidence and approach landmark-based accuracy at full observations, while others remain anchored to world-knowledge priors regardless of how much evidence accumulates.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The qualitative analysis of reasoning traces accurately identifies a fundamental difference in evidence integration rather than artifacts of prompting, model size, or domain-specific familiarity.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Frontier LLMs show uneven zero-shot performance on goal recognition in PDDL domains: some scale with accumulating evidence toward landmark-based accuracy while others stay anchored to world-knowledge priors.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Large language models show uneven competence at recognizing goals from action observations, with some improving as evidence grows and others staying fixed to prior knowledge.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"d6d925804eea69129f3a60bc081091b1219bc0a11dd91942ef353381080043bb"},"source":{"id":"2605.15333","kind":"arxiv","version":1},"verdict":{"id":"47be6849-3c5a-4d22-8eb7-7e081741ec3f","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T16:04:41.062824Z","strongest_claim":"Our results show that LLM competence on goal recognition is uneven: some models scale with evidence and approach landmark-based accuracy at full observations, while others remain anchored to world-knowledge priors regardless of how much evidence accumulates.","one_line_summary":"Frontier LLMs show uneven zero-shot performance on goal recognition in PDDL domains: some scale with accumulating evidence toward landmark-based accuracy while others stay anchored to world-knowledge priors.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The qualitative analysis of reasoning traces accurately identifies a fundamental difference in evidence integration rather than artifacts of prompting, model size, or domain-specific familiarity.","pith_extraction_headline":"Large language models show uneven competence at recognizing goals from action observations, with some improving as evidence grows and others staying fixed to prior knowledge."},"integrity":{"clean":false,"summary":{"advisory":1,"critical":0,"by_detector":{"doi_compliance":{"total":1,"advisory":1,"critical":0,"informational":0}},"informational":0},"endpoint":"/pith/2605.15333/integrity.json","findings":[{"note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arxiv.2511) was visible in the surrounding text but could not be confirmed against doi.org as printed.","detector":"doi_compliance","severity":"advisory","ref_index":3,"audited_at":"2026-05-19T16:15:58.833197Z","detected_doi":"10.48550/arxiv.2511","finding_type":"recoverable_identifier","verdict_class":"incontrovertible","detected_arxiv_id":null}],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T16:31:18.289951Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T16:15:58.833197Z","status":"completed","version":"1.0.0","findings_count":1},{"name":"claim_evidence","ran_at":"2026-05-19T14:41:54.191100Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.760559Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"f0a44e24d0e42a42e5d1346413e21eb4fa790bafdbaeae801edf5f55b1ffd7ec"},"references":{"count":33,"sample":[{"doi":"","year":1994,"title":"The computational complexity of propositional strips planning.Artificial Intelligence, 69(1-2):165–204, 1994","work_id":"d09183ae-18e6-4006-9446-3259f70c0483","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1609/icaps.v33i1.27237","year":2023,"title":"Goal recognition as a deep learning task: The grnet approach","work_id":"8cfdf6dd-c5d7-4b8a-9288-f8786969e232","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.48550/arxiv","year":2025,"title":"Frontier Large Language Models Rival State-of-the-Art Planners","work_id":"9713f11c-0c08-4158-8f46-f9a3d25d4085","ref_index":3,"cited_arxiv_id":"2511.09378","is_internal_anchor":true},{"doi":"10.1613/jair.1492","year":2004,"title":"Ordered landmarks in planning.Journal of Artificial Intelligence Research, 22(1):215–278, 2004","work_id":"ceb2dca8-7095-4975-9817-f2517a34b2b9","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Llms can’t plan, but can help planning in llm-modulo frameworks","work_id":"84a8bc82-1248-4ee2-a5ca-15d73cdc12be","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":33,"snapshot_sha256":"05367713d2729239844c2b7935ec20a1b9d1457af797084a12d2d87788a07117","internal_anchors":1},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f8b6c8df0e3c9da255248d1a3a79abd9a4acc0d9f4a5e6a4dc14bbd0891dce74"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"47be6849-3c5a-4d22-8eb7-7e081741ec3f"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:00:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"FnXN1DqmGKYQ1ZvxvRl62DYH20L7LT0hn6iVGr1R2Cq4/C7jiqUhBTUGEaojqo+cMCVJAG6xjM0eRmZ7V/j/Dw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T15:38:27.180754Z"},"content_sha256":"dc060933acce8227dc085f4b3b54f3d06121a1f3eb85551abf8cfe78be1cfc02","schema_version":"1.0","event_id":"sha256:dc060933acce8227dc085f4b3b54f3d06121a1f3eb85551abf8cfe78be1cfc02"},{"event_type":"integrity_finding","subject_pith_number":"pith:2026:SQGZQU75X55JQ6S365V6ZNFFRM","target":"integrity","payload":{"note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arxiv.2511) was visible in the surrounding text but could not be confirmed against doi.org as printed.","snippet":"Augusto B. Corrêa, André Grahl Pereira, and Jendrik Seipp. The 2025 planning performance of frontier large language models.CoRR, abs/2511.09378, 2025. doi: 10.48550/ARXIV .2511. 09378","arxiv_id":"2605.15333","detector":"doi_compliance","evidence":{"ref_index":3,"verdict_class":"incontrovertible","resolved_title":null,"printed_excerpt":"Augusto B. Corrêa, André Grahl Pereira, and Jendrik Seipp. The 2025 planning performance of frontier large language models.CoRR, abs/2511.09378, 2025. doi: 10.48550/ARXIV .2511. 09378","reconstructed_doi":"10.48550/arxiv.2511"},"severity":"advisory","ref_index":3,"audited_at":"2026-05-19T16:15:58.833197Z","event_type":"pith.integrity.v1","detected_doi":"10.48550/arxiv.2511","detector_url":"https://pith.science/pith-integrity-protocol#doi_compliance","external_url":null,"finding_type":"recoverable_identifier","evidence_hash":"1e3eac30fae9205da06689aaa9151dbb118b1fa8a9997c7405eaa6df8759d084","paper_version":1,"verdict_class":"incontrovertible","resolved_title":null,"detector_version":"1.0.0","detected_arxiv_id":null,"integrity_event_id":1993,"payload_sha256":"1b66209297a80311ec7812e293963068f0ed6aac131093d174e4e38d888f01e4","signature_b64":"Kuy39G8UAntfPTNcgH3j2HWGfSdu+cNvw2n4Or6Q2s7b0BdTtRi3o4XMU04DYXqjNsZA9DCYQs9/y24ot6AGBA==","signing_key_id":"pith-v1-2026-05"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-19T16:17:12Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"ZE0sfowtES2XDZ7nwZJY8cyU1gSxjV+a1cmt6rq0y/mAGZOjiB83Hqln1BbWx2RoVPM4Qc5DXIXmrnIO1kYMCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-09T15:38:27.181646Z"},"content_sha256":"92cf76b99e8c4ed6252e1effe067f4719cd2beab5b7f876b4da503960ff75506","schema_version":"1.0","event_id":"sha256:92cf76b99e8c4ed6252e1effe067f4719cd2beab5b7f876b4da503960ff75506"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/SQGZQU75X55JQ6S365V6ZNFFRM/bundle.json","state_url":"https://pith.science/pith/SQGZQU75X55JQ6S365V6ZNFFRM/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/SQGZQU75X55JQ6S365V6ZNFFRM/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-09T15:38:27Z","links":{"resolver":"https://pith.science/pith/SQGZQU75X55JQ6S365V6ZNFFRM","bundle":"https://pith.science/pith/SQGZQU75X55JQ6S365V6ZNFFRM/bundle.json","state":"https://pith.science/pith/SQGZQU75X55JQ6S365V6ZNFFRM/state.json","well_known_bundle":"https://pith.science/.well-known/pith/SQGZQU75X55JQ6S365V6ZNFFRM/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:SQGZQU75X55JQ6S365V6ZNFFRM","merge_version":"pith-open-graph-merge-v1","event_count":3,"valid_event_count":3,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"d662ad636d5ee04fb171da90b0f4f41463ca9c2c8e4a3e6e7f2e8a06c970b979","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T18:56:06Z","title_canon_sha256":"0597d612385480c03d08975004f6e3a92d19fcdb4f7292a1df15833cff627cba"},"schema_version":"1.0","source":{"id":"2605.15333","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.15333","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"arxiv_version","alias_value":"2605.15333v1","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15333","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"pith_short_12","alias_value":"SQGZQU75X55J","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"pith_short_16","alias_value":"SQGZQU75X55JQ6S3","created_at":"2026-05-20T00:00:53Z"},{"alias_kind":"pith_short_8","alias_value":"SQGZQU75","created_at":"2026-05-20T00:00:53Z"}],"graph_snapshots":[{"event_id":"sha256:dc060933acce8227dc085f4b3b54f3d06121a1f3eb85551abf8cfe78be1cfc02","target":"graph","created_at":"2026-05-20T00:00:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our results show that LLM competence on goal recognition is uneven: some models scale with evidence and approach landmark-based accuracy at full observations, while others remain anchored to world-knowledge priors regardless of how much evidence accumulates."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The qualitative analysis of reasoning traces accurately identifies a fundamental difference in evidence integration rather than artifacts of prompting, model size, or domain-specific familiarity."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Frontier LLMs show uneven zero-shot performance on goal recognition in PDDL domains: some scale with accumulating evidence toward landmark-based accuracy while others stay anchored to world-knowledge priors."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Large language models show uneven competence at recognizing goals from action observations, with some improving as evidence grows and others staying fixed to prior knowledge."}],"snapshot_sha256":"d6d925804eea69129f3a60bc081091b1219bc0a11dd91942ef353381080043bb"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f8b6c8df0e3c9da255248d1a3a79abd9a4acc0d9f4a5e6a4dc14bbd0891dce74"},"integrity":{"available":true,"clean":false,"detectors_run":[{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T16:31:18.289951Z","status":"completed","version":"1.0.0"},{"findings_count":1,"name":"doi_compliance","ran_at":"2026-05-19T16:15:58.833197Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-19T14:41:54.191100Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.760559Z","status":"skipped","version":"1.0.0"}],"endpoint":"/pith/2605.15333/integrity.json","findings":[{"audited_at":"2026-05-19T16:15:58.833197Z","detected_arxiv_id":null,"detected_doi":"10.48550/arxiv.2511","detector":"doi_compliance","finding_type":"recoverable_identifier","note":"DOI in the printed bibliography is fragmented by whitespace or line breaks. A longer candidate (10.48550/arxiv.2511) was visible in the surrounding text but could not be confirmed against doi.org as printed.","ref_index":3,"severity":"advisory","verdict_class":"incontrovertible"}],"snapshot_sha256":"f0a44e24d0e42a42e5d1346413e21eb4fa790bafdbaeae801edf5f55b1ffd7ec","summary":{"advisory":1,"by_detector":{"doi_compliance":{"advisory":1,"critical":0,"informational":0,"total":1}},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Large language models have recently reached near-parity with classical planners on well-known planning domains, yet this competence relies on world-knowledge exploitation rather than genuine symbolic reasoning. Goal recognition is a complementary abductive task structurally better suited to LLM strengths: it consists of evaluating consistency with world knowledge rather than generating novel action sequences. This paper provides the first systematic zero-shot evaluation of frontier LLMs as goal recognisers on key classical PDDL benchmarks. Our results show that LLM competence on goal recogniti","authors_text":"Felipe Meneguzzi, Kin Max Piamolini Gusm\\~ao, Nathan Gavenski, Nir Oren","cross_cats":[],"headline":"Large language models show uneven competence at recognizing goals from action observations, with some improving as evidence grows and others staying fixed to prior knowledge.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T18:56:06Z","title":"Zero-Shot Goal Recognition with Large Language Models"},"references":{"count":33,"internal_anchors":1,"resolved_work":33,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"The computational complexity of propositional strips planning.Artificial Intelligence, 69(1-2):165–204, 1994","work_id":"d09183ae-18e6-4006-9446-3259f70c0483","year":1994},{"cited_arxiv_id":"","doi":"10.1609/icaps.v33i1.27237","is_internal_anchor":false,"ref_index":2,"title":"Goal recognition as a deep learning task: The grnet approach","work_id":"8cfdf6dd-c5d7-4b8a-9288-f8786969e232","year":2023},{"cited_arxiv_id":"2511.09378","doi":"10.48550/arxiv","is_internal_anchor":true,"ref_index":3,"title":"Frontier Large Language Models Rival State-of-the-Art Planners","work_id":"9713f11c-0c08-4158-8f46-f9a3d25d4085","year":2025},{"cited_arxiv_id":"","doi":"10.1613/jair.1492","is_internal_anchor":false,"ref_index":4,"title":"Ordered landmarks in planning.Journal of Artificial Intelligence Research, 22(1):215–278, 2004","work_id":"ceb2dca8-7095-4975-9817-f2517a34b2b9","year":2004},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Llms can’t plan, but can help planning in llm-modulo frameworks","work_id":"84a8bc82-1248-4ee2-a5ca-15d73cdc12be","year":2024}],"snapshot_sha256":"05367713d2729239844c2b7935ec20a1b9d1457af797084a12d2d87788a07117"},"source":{"id":"2605.15333","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-19T16:04:41.062824Z","id":"47be6849-3c5a-4d22-8eb7-7e081741ec3f","model_set":{"reader":"grok-4.3"},"one_line_summary":"Frontier LLMs show uneven zero-shot performance on goal recognition in PDDL domains: some scale with accumulating evidence toward landmark-based accuracy while others stay anchored to world-knowledge priors.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Large language models show uneven competence at recognizing goals from action observations, with some improving as evidence grows and others staying fixed to prior knowledge.","strongest_claim":"Our results show that LLM competence on goal recognition is uneven: some models scale with evidence and approach landmark-based accuracy at full observations, while others remain anchored to world-knowledge priors regardless of how much evidence accumulates.","weakest_assumption":"The qualitative analysis of reasoning traces accurately identifies a fundamental difference in evidence integration rather than artifacts of prompting, model size, or domain-specific familiarity."}},"verdict_id":"47be6849-3c5a-4d22-8eb7-7e081741ec3f"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:097c0b51e0832a3bff76faf0fb99bae453f58e85983bb89e0061cbee338f550c","target":"record","created_at":"2026-05-20T00:00:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"d662ad636d5ee04fb171da90b0f4f41463ca9c2c8e4a3e6e7f2e8a06c970b979","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-05-14T18:56:06Z","title_canon_sha256":"0597d612385480c03d08975004f6e3a92d19fcdb4f7292a1df15833cff627cba"},"schema_version":"1.0","source":{"id":"2605.15333","kind":"arxiv","version":1}},"canonical_sha256":"940d9853fdbf7a987a5bf76becb4a58b386eb15069f5e22786c38aa1a7f42569","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"940d9853fdbf7a987a5bf76becb4a58b386eb15069f5e22786c38aa1a7f42569","first_computed_at":"2026-05-20T00:00:53.021274Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:00:53.021274Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"+OaX44UrftD/oUv8vPLnidvLk2EmFJ2o2ZvueD/X8Prqt+Jy9c4Nf7KTvqX42nRNvCB2n8Pp52V99YeRHX/mAA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:00:53.022008Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.15333","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:92cf76b99e8c4ed6252e1effe067f4719cd2beab5b7f876b4da503960ff75506","sha256:097c0b51e0832a3bff76faf0fb99bae453f58e85983bb89e0061cbee338f550c","sha256:dc060933acce8227dc085f4b3b54f3d06121a1f3eb85551abf8cfe78be1cfc02"],"state_sha256":"d5cec8448ce69501718cadf3072278c79eb2d05ecdaffa29f9541db85bd73062"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"k8C1kBnIlVcb684djDHovUslJpU0Fr3nmrjHWbgoOlKIwWJTSaRHBZ4SrRbq8gJYz/UXQDnv+1ClvFu9STM/DA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-09T15:38:27.184493Z","bundle_sha256":"74a5c95b312cefa9a03196cdc0c39bc3bbec1c87300a4ed26e7d241275b18f48"}}