{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:NJNI2GAUBWCGMDJAN5JBUKZGUF","short_pith_number":"pith:NJNI2GAU","schema_version":"1.0","canonical_sha256":"6a5a8d18140d84660d206f521a2b26a15833d0e396c5cca54e47e866e6072220","source":{"kind":"arxiv","id":"1806.03901","version":1},"attestation_state":"computed","paper":{"title":"A Cost-based Storage Format Selector for Materialization in Big Data Frameworks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DC","authors_text":"Alberto Abell\\'o, Maik Thiele, Oscar Romero, Rana Faisal Munir, Wolfgang Lehner","submitted_at":"2018-06-11T10:58:04Z","abstract_excerpt":"Modern big data frameworks (such as Hadoop and Spark) allow multiple users to do large-scale analysis simultaneously. Typically, users deploy Data-Intensive Workflows (DIWs) for their analytical tasks. These DIWs of different users share many common parts (i.e, 50-80%), which can be materialized to reuse them in future executions. The materialization improves the overall processing time of DIWs and also saves computational resources. Current solutions for materialization store data on Distributed File Systems (DFS) by using a fixed data format. However, a fixed choice might not be the optimal "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1806.03901","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2018-06-11T10:58:04Z","cross_cats_sorted":[],"title_canon_sha256":"2818d14491043ac10d1ebb38fde4890523351226626b28ae20303d81bec3c4e7","abstract_canon_sha256":"68577e53f991cc6d47e3cacac43813944708f1ffc9a0ebdf83a5a80250c9c95c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:13:41.616827Z","signature_b64":"HMeT55l/m6abEm42bt/ERec58PmxkxcDRf8JXBjmWhN9DF1aTd5P5AYVfzhdVD9nBbt3fFAZxB8uh2KM9rh7BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6a5a8d18140d84660d206f521a2b26a15833d0e396c5cca54e47e866e6072220","last_reissued_at":"2026-05-18T00:13:41.616040Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:13:41.616040Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"A Cost-based Storage Format Selector for Materialization in Big Data Frameworks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.DC","authors_text":"Alberto Abell\\'o, Maik Thiele, Oscar Romero, Rana Faisal Munir, Wolfgang Lehner","submitted_at":"2018-06-11T10:58:04Z","abstract_excerpt":"Modern big data frameworks (such as Hadoop and Spark) allow multiple users to do large-scale analysis simultaneously. Typically, users deploy Data-Intensive Workflows (DIWs) for their analytical tasks. These DIWs of different users share many common parts (i.e, 50-80%), which can be materialized to reuse them in future executions. The materialization improves the overall processing time of DIWs and also saves computational resources. Current solutions for materialization store data on Distributed File Systems (DFS) by using a fixed data format. However, a fixed choice might not be the optimal "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1806.03901","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1806.03901","created_at":"2026-05-18T00:13:41.616180+00:00"},{"alias_kind":"arxiv_version","alias_value":"1806.03901v1","created_at":"2026-05-18T00:13:41.616180+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1806.03901","created_at":"2026-05-18T00:13:41.616180+00:00"},{"alias_kind":"pith_short_12","alias_value":"NJNI2GAUBWCG","created_at":"2026-05-18T12:32:40.477152+00:00"},{"alias_kind":"pith_short_16","alias_value":"NJNI2GAUBWCGMDJA","created_at":"2026-05-18T12:32:40.477152+00:00"},{"alias_kind":"pith_short_8","alias_value":"NJNI2GAU","created_at":"2026-05-18T12:32:40.477152+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF","json":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF.json","graph_json":"https://pith.science/api/pith-number/NJNI2GAUBWCGMDJAN5JBUKZGUF/graph.json","events_json":"https://pith.science/api/pith-number/NJNI2GAUBWCGMDJAN5JBUKZGUF/events.json","paper":"https://pith.science/paper/NJNI2GAU"},"agent_actions":{"view_html":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF","download_json":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF.json","view_paper":"https://pith.science/paper/NJNI2GAU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1806.03901&json=true","fetch_graph":"https://pith.science/api/pith-number/NJNI2GAUBWCGMDJAN5JBUKZGUF/graph.json","fetch_events":"https://pith.science/api/pith-number/NJNI2GAUBWCGMDJAN5JBUKZGUF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF/action/storage_attestation","attest_author":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF/action/author_attestation","sign_citation":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF/action/citation_signature","submit_replication":"https://pith.science/pith/NJNI2GAUBWCGMDJAN5JBUKZGUF/action/replication_record"}},"created_at":"2026-05-18T00:13:41.616180+00:00","updated_at":"2026-05-18T00:13:41.616180+00:00"}