{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2014:22NAFFMJC7OI2OMTNUYEGOLWWH","short_pith_number":"pith:22NAFFMJ","schema_version":"1.0","canonical_sha256":"d69a02958917dc8d39936d30433976b1ca887e81e4e9cc6f8c9c25db396ab74b","source":{"kind":"arxiv","id":"1408.5427","version":1},"attestation_state":"computed","paper":{"title":"A Case Study in Text Mining: Interpreting Twitter Data From World Cup Tweets","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.IR","cs.LG"],"primary_cat":"stat.ML","authors_text":"Caley Johns, Carl Meyer, Carol Sadek, Daniel Godfrey, Shaina Race","submitted_at":"2014-08-21T17:58:33Z","abstract_excerpt":"Cluster analysis is a field of data analysis that extracts underlying patterns in data. One application of cluster analysis is in text-mining, the analysis of large collections of text to find similarities between documents. We used a collection of about 30,000 tweets extracted from Twitter just before the World Cup started. A common problem with real world text data is the presence of linguistic noise. In our case it would be extraneous tweets that are unrelated to dominant themes. To combat this problem, we created an algorithm that combined the DBSCAN algorithm and a consensus matrix. This "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1408.5427","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ML","submitted_at":"2014-08-21T17:58:33Z","cross_cats_sorted":["cs.CL","cs.IR","cs.LG"],"title_canon_sha256":"1f61f535f50a72e5d5794a235f180d203f6123eec8998f46cf20d444b6f84b47","abstract_canon_sha256":"3f7f59b645e0cd340d32471744e879645feb20eb0d44175a9d06a43e149165a8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:25.203988Z","signature_b64":"6Tw+3NxF2ZoOeSyhOtFDOTDE54JobaJ7QV+MHLIfZrmrXs5Gi+XUDko5vNoS3JftDYj4BC8jeC3qn2zenndxDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d69a02958917dc8d39936d30433976b1ca887e81e4e9cc6f8c9c25db396ab74b","last_reissued_at":"2026-05-18T02:44:25.203583Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:25.203583Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"A Case Study in Text Mining: Interpreting Twitter Data From World Cup Tweets","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.IR","cs.LG"],"primary_cat":"stat.ML","authors_text":"Caley Johns, Carl Meyer, Carol Sadek, Daniel Godfrey, Shaina Race","submitted_at":"2014-08-21T17:58:33Z","abstract_excerpt":"Cluster analysis is a field of data analysis that extracts underlying patterns in data. One application of cluster analysis is in text-mining, the analysis of large collections of text to find similarities between documents. We used a collection of about 30,000 tweets extracted from Twitter just before the World Cup started. A common problem with real world text data is the presence of linguistic noise. In our case it would be extraneous tweets that are unrelated to dominant themes. To combat this problem, we created an algorithm that combined the DBSCAN algorithm and a consensus matrix. This "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1408.5427","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1408.5427","created_at":"2026-05-18T02:44:25.203650+00:00"},{"alias_kind":"arxiv_version","alias_value":"1408.5427v1","created_at":"2026-05-18T02:44:25.203650+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1408.5427","created_at":"2026-05-18T02:44:25.203650+00:00"},{"alias_kind":"pith_short_12","alias_value":"22NAFFMJC7OI","created_at":"2026-05-18T12:28:09.283467+00:00"},{"alias_kind":"pith_short_16","alias_value":"22NAFFMJC7OI2OMT","created_at":"2026-05-18T12:28:09.283467+00:00"},{"alias_kind":"pith_short_8","alias_value":"22NAFFMJ","created_at":"2026-05-18T12:28:09.283467+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH","json":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH.json","graph_json":"https://pith.science/api/pith-number/22NAFFMJC7OI2OMTNUYEGOLWWH/graph.json","events_json":"https://pith.science/api/pith-number/22NAFFMJC7OI2OMTNUYEGOLWWH/events.json","paper":"https://pith.science/paper/22NAFFMJ"},"agent_actions":{"view_html":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH","download_json":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH.json","view_paper":"https://pith.science/paper/22NAFFMJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1408.5427&json=true","fetch_graph":"https://pith.science/api/pith-number/22NAFFMJC7OI2OMTNUYEGOLWWH/graph.json","fetch_events":"https://pith.science/api/pith-number/22NAFFMJC7OI2OMTNUYEGOLWWH/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH/action/timestamp_anchor","attest_storage":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH/action/storage_attestation","attest_author":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH/action/author_attestation","sign_citation":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH/action/citation_signature","submit_replication":"https://pith.science/pith/22NAFFMJC7OI2OMTNUYEGOLWWH/action/replication_record"}},"created_at":"2026-05-18T02:44:25.203650+00:00","updated_at":"2026-05-18T02:44:25.203650+00:00"}