{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:66PDSP3G4NIBWALESZ62474ERC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"415608be6766ee77a4cb4f46df39171debf05ed2bec5f2d79ef1a85aae0a2094","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-03-13T01:54:00Z","title_canon_sha256":"4258bd0dce48de8427f949f2bc374b2ab649c599f074aac85b393c968cf14279"},"schema_version":"1.0","source":{"id":"2603.12564","kind":"arxiv","version":8}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.12564","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"arxiv_version","alias_value":"2603.12564v8","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.12564","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_12","alias_value":"66PDSP3G4NIB","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_16","alias_value":"66PDSP3G4NIBWALE","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_8","alias_value":"66PDSP3G","created_at":"2026-05-27T01:04:57Z"}],"graph_snapshots":[{"event_id":"sha256:599da38e8a4da28db00a616f8f242673b0e0c4b7c2de79c75a1f2973b0130acc","target":"graph","created_at":"2026-05-27T01:04:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Stronger models are not safer: the best-performing model has the highest quality score yet the worst suitability violations (99.1% of turns). This points to an alignment-grounding tension where faithful grounding in tool data makes the agent the most reliable executor of bad data."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the specific tool data manipulations used in the 23-turn replays are representative of realistic errors an agent might encounter, and that sparse autoencoder probing reliably indicates internal detection without corresponding output changes."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LLM agents exhibit evaluation blindness in multi-turn financial advice, with stronger models showing up to 99.1% suitability violations when tool data is manipulated, as internal detection fails to produce safer outputs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"LLM recommendation agents keep giving unsuitable financial advice when tool data is wrong, with stronger models violating suitability most often."}],"snapshot_sha256":"020377b5b1c04eca216475be1f778c205e986ce2833d1162b98fb718e1ac7371"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"a65b969fb1a0389f243f54732f47fa1b9f2a5743a7fc74566c2258701520cd7d"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2603.12564/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"People increasingly use LLM agents for multi-turn financial recommendations, where the agent pulls market data through tools and tracks user preferences across turns. When tool outputs are manipulated, the recommendations stop matching the user's stated risk profile, but because standard metrics like NDCG only score general relevance, risky and safe stocks score alike, so the metric says nothing went wrong. We call this gap evaluation blindness. We replay 23-turn financial advisory conversations across eight language models, running each dialogue twice with clean and manipulated tool data. Qua","authors_text":"Adriano Koshiyama, Maria Perez-Ortiz, Sahan Bulathwela, Zekun Wu","cross_cats":["cs.AI"],"headline":"LLM recommendation agents keep giving unsuitable financial advice when tool data is wrong, with stronger models violating suitability most often.","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-03-13T01:54:00Z","title":"Sell Me This Stock: Unsafe Recommendation Drift in LLM Agents"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.12564","kind":"arxiv","version":8},"verdict":{"created_at":"2026-05-15T12:30:40.597757Z","id":"e15f2287-6e37-4f94-94b9-4ba9c556a137","model_set":{"reader":"grok-4.3"},"one_line_summary":"LLM agents exhibit evaluation blindness in multi-turn financial advice, with stronger models showing up to 99.1% suitability violations when tool data is manipulated, as internal detection fails to produce safer outputs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"LLM recommendation agents keep giving unsuitable financial advice when tool data is wrong, with stronger models violating suitability most often.","strongest_claim":"Stronger models are not safer: the best-performing model has the highest quality score yet the worst suitability violations (99.1% of turns). This points to an alignment-grounding tension where faithful grounding in tool data makes the agent the most reliable executor of bad data.","weakest_assumption":"That the specific tool data manipulations used in the 23-turn replays are representative of realistic errors an agent might encounter, and that sparse autoencoder probing reliably indicates internal detection without corresponding output changes."}},"verdict_id":"e15f2287-6e37-4f94-94b9-4ba9c556a137"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9df2c9cbb26db83a3f36085704d16a04c09427bbd818ead563e8584ef92116ea","target":"record","created_at":"2026-05-27T01:04:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"415608be6766ee77a4cb4f46df39171debf05ed2bec5f2d79ef1a85aae0a2094","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CL","submitted_at":"2026-03-13T01:54:00Z","title_canon_sha256":"4258bd0dce48de8427f949f2bc374b2ab649c599f074aac85b393c968cf14279"},"schema_version":"1.0","source":{"id":"2603.12564","kind":"arxiv","version":8}},"canonical_sha256":"f79e393f66e3501b0164967dae7f8488b63c3004c84f583f65a5526574ac2e8a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"f79e393f66e3501b0164967dae7f8488b63c3004c84f583f65a5526574ac2e8a","first_computed_at":"2026-05-27T01:04:57.105296Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T01:04:57.105296Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"4jskEN5IOoJ9BC5I/wOiweOAU5YXSBYNtZFOI3B9zA6NbiPs/AWAvWuqKCiJhRipoMgE10gpukyzdY58YcUNCg==","signature_status":"signed_v1","signed_at":"2026-05-27T01:04:57.105885Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.12564","source_kind":"arxiv","source_version":8}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9df2c9cbb26db83a3f36085704d16a04c09427bbd818ead563e8584ef92116ea","sha256:599da38e8a4da28db00a616f8f242673b0e0c4b7c2de79c75a1f2973b0130acc"],"state_sha256":"0de79b078c2fda7f0b19e1b374393b5951646a2ed813704395322c5540c9a034"}