{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2022:DJHSBN54HU2VV2B7SIQRIVL5FE","short_pith_number":"pith:DJHSBN54","canonical_record":{"source":{"id":"2210.03350","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-10-07T06:50:23Z","cross_cats_sorted":[],"title_canon_sha256":"b51c2e7d513e3bc51f957d92064bdf2b30f25b753209fbaa8ff4a6266f98bb0d","abstract_canon_sha256":"b6f926ce0d2d9a6fb0798b7b877f894c9f7ab05f8cabc5a4854025fc477756e1"},"schema_version":"1.0"},"canonical_sha256":"1a4f20b7bc3d355ae83f922114557d291eb3fffea673138ac709f19448d57925","source":{"kind":"arxiv","id":"2210.03350","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2210.03350","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2210.03350v3","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2210.03350","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"DJHSBN54HU2V","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"DJHSBN54HU2VV2B7","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"DJHSBN54","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2022:DJHSBN54HU2VV2B7SIQRIVL5FE","target":"record","payload":{"canonical_record":{"source":{"id":"2210.03350","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-10-07T06:50:23Z","cross_cats_sorted":[],"title_canon_sha256":"b51c2e7d513e3bc51f957d92064bdf2b30f25b753209fbaa8ff4a6266f98bb0d","abstract_canon_sha256":"b6f926ce0d2d9a6fb0798b7b877f894c9f7ab05f8cabc5a4854025fc477756e1"},"schema_version":"1.0"},"canonical_sha256":"1a4f20b7bc3d355ae83f922114557d291eb3fffea673138ac709f19448d57925","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.480592Z","signature_b64":"7rQMLq/TcwYc9YaPhXeiAFoSi5v8pEFuvikC4mS9G3s/Ij7Bcl9VWlMXt+hJEeBzCq65pP/mBo2sCj11/EcmCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1a4f20b7bc3d355ae83f922114557d291eb3fffea673138ac709f19448d57925","last_reissued_at":"2026-05-17T23:38:13.479942Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.479942Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2210.03350","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NdX0tAYQsb0qvEyG6RLPldtN/1iI5uWKTLnkPkf/PqgK8Q7AZ+HNs30jPd/ZJaqeRvLMhlqVFbr22mvkaqwcBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T15:04:17.767742Z"},"content_sha256":"b619ab96c18f35c781b882e19689b115243affa8e0a4ee0dd3b33bebb0a3febf","schema_version":"1.0","event_id":"sha256:b619ab96c18f35c781b882e19689b115243affa8e0a4ee0dd3b33bebb0a3febf"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2022:DJHSBN54HU2VV2B7SIQRIVL5FE","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Measuring and Narrowing the Compositionality Gap in Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Larger language models improve single-fact recall faster than they improve the ability to compose multiple facts into answers.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Ludwig Schmidt, Mike Lewis, Muru Zhang, Noah A. Smith, Ofir Press, Sewon Min","submitted_at":"2022-10-07T06:50:23Z","abstract_excerpt":"We investigate the ability of language models to perform compositional reasoning tasks where the overall solution depends on correctly composing the answers to sub-problems. We measure how often models can correctly answer all sub-problems but not generate the overall solution, a ratio we call the compositionality gap. We evaluate this ratio by asking multi-hop questions with answers that require composing multiple facts unlikely to have been observed together during pretraining. In the GPT-3 family of models, as model size increases we show that the single-hop question answering performance i"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"In the GPT-3 family of models, as model size increases we show that the single-hop question answering performance improves faster than the multi-hop performance does, therefore the compositionality gap does not decrease.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the multi-hop questions are built from facts unlikely to have been observed together during pretraining, so that correct answers to the full question must come from composition rather than direct memorization of the combined fact.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Larger language models improve faster at single facts than at composing them, but self-ask prompting reduces the compositionality gap by forcing explicit intermediate questions.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Larger language models improve single-fact recall faster than they improve the ability to compose multiple facts into answers.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"76af897bafc5529bad8fa162060531c25baf0390f357b741305838bb6522eae3"},"source":{"id":"2210.03350","kind":"arxiv","version":3},"verdict":{"id":"c785ee43-207b-4c8f-b685-1dccaf41523a","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T17:45:40.663909Z","strongest_claim":"In the GPT-3 family of models, as model size increases we show that the single-hop question answering performance improves faster than the multi-hop performance does, therefore the compositionality gap does not decrease.","one_line_summary":"Larger language models improve faster at single facts than at composing them, but self-ask prompting reduces the compositionality gap by forcing explicit intermediate questions.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the multi-hop questions are built from facts unlikely to have been observed together during pretraining, so that correct answers to the full question must come from composition rather than direct memorization of the combined fact.","pith_extraction_headline":"Larger language models improve single-fact recall faster than they improve the ability to compose multiple facts into answers."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"6e82f68289ec3891e4b7f18d1b5f02a210155d166ef9bfd2cd6426aee7c33f25"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"c785ee43-207b-4c8f-b685-1dccaf41523a"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yu40/xa4YGTKaTjXP7xf/1C0JehKcQjMBXo2ZXf63q9xheYCWmmJfTmqXytz6S6Q2Sd+utenHZIFgpLygsjwBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T15:04:17.768589Z"},"content_sha256":"77785d08c1950b6a9653da1508f23f65c5710469078dae3a6c6934ae2df89940","schema_version":"1.0","event_id":"sha256:77785d08c1950b6a9653da1508f23f65c5710469078dae3a6c6934ae2df89940"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/DJHSBN54HU2VV2B7SIQRIVL5FE/bundle.json","state_url":"https://pith.science/pith/DJHSBN54HU2VV2B7SIQRIVL5FE/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/DJHSBN54HU2VV2B7SIQRIVL5FE/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T15:04:17Z","links":{"resolver":"https://pith.science/pith/DJHSBN54HU2VV2B7SIQRIVL5FE","bundle":"https://pith.science/pith/DJHSBN54HU2VV2B7SIQRIVL5FE/bundle.json","state":"https://pith.science/pith/DJHSBN54HU2VV2B7SIQRIVL5FE/state.json","well_known_bundle":"https://pith.science/.well-known/pith/DJHSBN54HU2VV2B7SIQRIVL5FE/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2022:DJHSBN54HU2VV2B7SIQRIVL5FE","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b6f926ce0d2d9a6fb0798b7b877f894c9f7ab05f8cabc5a4854025fc477756e1","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-10-07T06:50:23Z","title_canon_sha256":"b51c2e7d513e3bc51f957d92064bdf2b30f25b753209fbaa8ff4a6266f98bb0d"},"schema_version":"1.0","source":{"id":"2210.03350","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2210.03350","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2210.03350v3","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2210.03350","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"DJHSBN54HU2V","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"DJHSBN54HU2VV2B7","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"DJHSBN54","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:77785d08c1950b6a9653da1508f23f65c5710469078dae3a6c6934ae2df89940","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"In the GPT-3 family of models, as model size increases we show that the single-hop question answering performance improves faster than the multi-hop performance does, therefore the compositionality gap does not decrease."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the multi-hop questions are built from facts unlikely to have been observed together during pretraining, so that correct answers to the full question must come from composition rather than direct memorization of the combined fact."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Larger language models improve faster at single facts than at composing them, but self-ask prompting reduces the compositionality gap by forcing explicit intermediate questions."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Larger language models improve single-fact recall faster than they improve the ability to compose multiple facts into answers."}],"snapshot_sha256":"76af897bafc5529bad8fa162060531c25baf0390f357b741305838bb6522eae3"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"6e82f68289ec3891e4b7f18d1b5f02a210155d166ef9bfd2cd6426aee7c33f25"},"paper":{"abstract_excerpt":"We investigate the ability of language models to perform compositional reasoning tasks where the overall solution depends on correctly composing the answers to sub-problems. We measure how often models can correctly answer all sub-problems but not generate the overall solution, a ratio we call the compositionality gap. We evaluate this ratio by asking multi-hop questions with answers that require composing multiple facts unlikely to have been observed together during pretraining. In the GPT-3 family of models, as model size increases we show that the single-hop question answering performance i","authors_text":"Ludwig Schmidt, Mike Lewis, Muru Zhang, Noah A. Smith, Ofir Press, Sewon Min","cross_cats":[],"headline":"Larger language models improve single-fact recall faster than they improve the ability to compose multiple facts into answers.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-10-07T06:50:23Z","title":"Measuring and Narrowing the Compositionality Gap in Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2210.03350","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-17T17:45:40.663909Z","id":"c785ee43-207b-4c8f-b685-1dccaf41523a","model_set":{"reader":"grok-4.3"},"one_line_summary":"Larger language models improve faster at single facts than at composing them, but self-ask prompting reduces the compositionality gap by forcing explicit intermediate questions.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Larger language models improve single-fact recall faster than they improve the ability to compose multiple facts into answers.","strongest_claim":"In the GPT-3 family of models, as model size increases we show that the single-hop question answering performance improves faster than the multi-hop performance does, therefore the compositionality gap does not decrease.","weakest_assumption":"That the multi-hop questions are built from facts unlikely to have been observed together during pretraining, so that correct answers to the full question must come from composition rather than direct memorization of the combined fact."}},"verdict_id":"c785ee43-207b-4c8f-b685-1dccaf41523a"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:b619ab96c18f35c781b882e19689b115243affa8e0a4ee0dd3b33bebb0a3febf","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b6f926ce0d2d9a6fb0798b7b877f894c9f7ab05f8cabc5a4854025fc477756e1","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-10-07T06:50:23Z","title_canon_sha256":"b51c2e7d513e3bc51f957d92064bdf2b30f25b753209fbaa8ff4a6266f98bb0d"},"schema_version":"1.0","source":{"id":"2210.03350","kind":"arxiv","version":3}},"canonical_sha256":"1a4f20b7bc3d355ae83f922114557d291eb3fffea673138ac709f19448d57925","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"1a4f20b7bc3d355ae83f922114557d291eb3fffea673138ac709f19448d57925","first_computed_at":"2026-05-17T23:38:13.479942Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.479942Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"7rQMLq/TcwYc9YaPhXeiAFoSi5v8pEFuvikC4mS9G3s/Ij7Bcl9VWlMXt+hJEeBzCq65pP/mBo2sCj11/EcmCw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.480592Z","signed_message":"canonical_sha256_bytes"},"source_id":"2210.03350","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:b619ab96c18f35c781b882e19689b115243affa8e0a4ee0dd3b33bebb0a3febf","sha256:77785d08c1950b6a9653da1508f23f65c5710469078dae3a6c6934ae2df89940"],"state_sha256":"bdae3661639f5d602da7883105db86ecd845904afbc1d00f212d36f6d31a6ceb"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HcDQWlkVARimjRl+kEIF2eerBDTV7EuC0ezblsikPsuk0FX6029/ZoV0Z0944e3q+yrwUecNiE4j/g/kEQtmDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T15:04:17.772350Z","bundle_sha256":"08790d53c3b81d7450b25796115944966cbe347ece83dd6ff864d90f3b216e85"}}