{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:PNNM24TC53QBWCDELBLMEJR7WW","short_pith_number":"pith:PNNM24TC","schema_version":"1.0","canonical_sha256":"7b5acd7262eee01b08645856c2263fb5a9bd4261bfe65b104f9c9b0c9efbdd5d","source":{"kind":"arxiv","id":"1803.05046","version":1},"attestation_state":"computed","paper":{"title":"Caveat Emptor, Computational Social Science: Large-Scale Missing Data in a Widely-Published Reddit Corpus","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.SI","authors_text":"Devin Gaffney, J. Nathan Matias","submitted_at":"2018-03-13T21:07:52Z","abstract_excerpt":"As researchers use computational methods to study complex social behaviors at scale, the validity of this computational social science depends on the integrity of the data. On July 2, 2015, Jason Baumgartner published a dataset advertised to include ``every publicly available Reddit comment'' which was quickly shared on Bittorrent and the Internet Archive. This data quickly became the basis of many academic papers on topics including machine learning, social behavior, politics, breaking news, and hate speech. We have discovered substantial gaps and limitations in this dataset which may contrib"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1803.05046","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.SI","submitted_at":"2018-03-13T21:07:52Z","cross_cats_sorted":[],"title_canon_sha256":"979e496598a2256ddb25160399f3b285d3e0827d071a8b2a243ac87896e65e50","abstract_canon_sha256":"c9c97bc47a95662ed271613d71c83ef9f7b07fa998598b453928caac945ce6bd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:06:40.452795Z","signature_b64":"lJebpgYEhcVZ1BDEN2Mv9YqpINaKqgwIV3PMM4qhBKK5oUy11MPTQyd05YtYXe6l7Yrh79jzyGl8lKiTo9LqDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7b5acd7262eee01b08645856c2263fb5a9bd4261bfe65b104f9c9b0c9efbdd5d","last_reissued_at":"2026-05-18T00:06:40.452279Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:06:40.452279Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Caveat Emptor, Computational Social Science: Large-Scale Missing Data in a Widely-Published Reddit Corpus","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.SI","authors_text":"Devin Gaffney, J. Nathan Matias","submitted_at":"2018-03-13T21:07:52Z","abstract_excerpt":"As researchers use computational methods to study complex social behaviors at scale, the validity of this computational social science depends on the integrity of the data. On July 2, 2015, Jason Baumgartner published a dataset advertised to include ``every publicly available Reddit comment'' which was quickly shared on Bittorrent and the Internet Archive. This data quickly became the basis of many academic papers on topics including machine learning, social behavior, politics, breaking news, and hate speech. We have discovered substantial gaps and limitations in this dataset which may contrib"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1803.05046","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1803.05046","created_at":"2026-05-18T00:06:40.452371+00:00"},{"alias_kind":"arxiv_version","alias_value":"1803.05046v1","created_at":"2026-05-18T00:06:40.452371+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1803.05046","created_at":"2026-05-18T00:06:40.452371+00:00"},{"alias_kind":"pith_short_12","alias_value":"PNNM24TC53QB","created_at":"2026-05-18T12:32:46.962924+00:00"},{"alias_kind":"pith_short_16","alias_value":"PNNM24TC53QBWCDE","created_at":"2026-05-18T12:32:46.962924+00:00"},{"alias_kind":"pith_short_8","alias_value":"PNNM24TC","created_at":"2026-05-18T12:32:46.962924+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW","json":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW.json","graph_json":"https://pith.science/api/pith-number/PNNM24TC53QBWCDELBLMEJR7WW/graph.json","events_json":"https://pith.science/api/pith-number/PNNM24TC53QBWCDELBLMEJR7WW/events.json","paper":"https://pith.science/paper/PNNM24TC"},"agent_actions":{"view_html":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW","download_json":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW.json","view_paper":"https://pith.science/paper/PNNM24TC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1803.05046&json=true","fetch_graph":"https://pith.science/api/pith-number/PNNM24TC53QBWCDELBLMEJR7WW/graph.json","fetch_events":"https://pith.science/api/pith-number/PNNM24TC53QBWCDELBLMEJR7WW/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW/action/storage_attestation","attest_author":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW/action/author_attestation","sign_citation":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW/action/citation_signature","submit_replication":"https://pith.science/pith/PNNM24TC53QBWCDELBLMEJR7WW/action/replication_record"}},"created_at":"2026-05-18T00:06:40.452371+00:00","updated_at":"2026-05-18T00:06:40.452371+00:00"}