{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:YGTHSWTW6DYMD4OPVDZ2OJXDRE","short_pith_number":"pith:YGTHSWTW","schema_version":"1.0","canonical_sha256":"c1a6795a76f0f0c1f1cfa8f3a726e3892f019efecf475a5686b6f4fcaafbb1fd","source":{"kind":"arxiv","id":"2606.24346","version":1},"attestation_state":"computed","paper":{"title":"PETRA: Transforming Web Text for Petroleum-Engineering Domain Adaptation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.IR","authors_text":"(2) Inception AI), Adrian Garcia-Garcia (2), Aya El Mir (1), Federico Castanedo (2), Hachem Madmoun (1), Kirill Dubovikov (1), Larry Murray (2), Martin Takac (1), Omar El Mansouri (1), Onkar Pandit (2), Salem Lahlou (1) ((1) Mohamed bin Zayed University of Artificial Intelligence, Sandeep Kumar (1), Sunil Kumar Sahu (2), Supriyo Ghosh (2), Writabrata Bhattacharya (2), Yanda Li (1)","submitted_at":"2026-06-23T09:37:44Z","abstract_excerpt":"Petroleum-engineering search exposes a supervision gap for strong general retrievers: relevant evidence exists in public web text, but domain relevance labels are scarce. To address this gap, we propose PETRA, a large-scale Petroleum Engineering Text for Retrieval Adaptation dataset and pipeline that converts noisy public web data into a curated domain corpus and synthetic supervision for dense retrieval and reranking. PETRA contains 1.36M curated chunks, approximately 2B token equivalents, $\\approx$859k, embedding training rows from $\\approx$224k anchors, and roughly 400k teacher-scored reran"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.24346","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.IR","submitted_at":"2026-06-23T09:37:44Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"3fdb0661c6eee0d40021d26d435c11de212be61588c4c8c2ae3ddc2fc7c1efdd","abstract_canon_sha256":"8010516bb11508f770a44f79051a793d4be4d2c78eec503e9c441ed0b343fedb"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-24T01:15:27.867126Z","signature_b64":"L2b9jMK/sT1ElSW4bfBPpL2ugCzJKrFNxskRLfzqzNqVuWBR06LARR53slQAx2vj5S5SGOX6829V0Pmv5wGVBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c1a6795a76f0f0c1f1cfa8f3a726e3892f019efecf475a5686b6f4fcaafbb1fd","last_reissued_at":"2026-06-24T01:15:27.866703Z","signature_status":"signed_v1","first_computed_at":"2026-06-24T01:15:27.866703Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"PETRA: Transforming Web Text for Petroleum-Engineering Domain Adaptation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.IR","authors_text":"(2) Inception AI), Adrian Garcia-Garcia (2), Aya El Mir (1), Federico Castanedo (2), Hachem Madmoun (1), Kirill Dubovikov (1), Larry Murray (2), Martin Takac (1), Omar El Mansouri (1), Onkar Pandit (2), Salem Lahlou (1) ((1) Mohamed bin Zayed University of Artificial Intelligence, Sandeep Kumar (1), Sunil Kumar Sahu (2), Supriyo Ghosh (2), Writabrata Bhattacharya (2), Yanda Li (1)","submitted_at":"2026-06-23T09:37:44Z","abstract_excerpt":"Petroleum-engineering search exposes a supervision gap for strong general retrievers: relevant evidence exists in public web text, but domain relevance labels are scarce. To address this gap, we propose PETRA, a large-scale Petroleum Engineering Text for Retrieval Adaptation dataset and pipeline that converts noisy public web data into a curated domain corpus and synthetic supervision for dense retrieval and reranking. PETRA contains 1.36M curated chunks, approximately 2B token equivalents, $\\approx$859k, embedding training rows from $\\approx$224k anchors, and roughly 400k teacher-scored reran"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.24346","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.24346/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.24346","created_at":"2026-06-24T01:15:27.866759+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.24346v1","created_at":"2026-06-24T01:15:27.866759+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.24346","created_at":"2026-06-24T01:15:27.866759+00:00"},{"alias_kind":"pith_short_12","alias_value":"YGTHSWTW6DYM","created_at":"2026-06-24T01:15:27.866759+00:00"},{"alias_kind":"pith_short_16","alias_value":"YGTHSWTW6DYMD4OP","created_at":"2026-06-24T01:15:27.866759+00:00"},{"alias_kind":"pith_short_8","alias_value":"YGTHSWTW","created_at":"2026-06-24T01:15:27.866759+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE","json":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE.json","graph_json":"https://pith.science/api/pith-number/YGTHSWTW6DYMD4OPVDZ2OJXDRE/graph.json","events_json":"https://pith.science/api/pith-number/YGTHSWTW6DYMD4OPVDZ2OJXDRE/events.json","paper":"https://pith.science/paper/YGTHSWTW"},"agent_actions":{"view_html":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE","download_json":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE.json","view_paper":"https://pith.science/paper/YGTHSWTW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.24346&json=true","fetch_graph":"https://pith.science/api/pith-number/YGTHSWTW6DYMD4OPVDZ2OJXDRE/graph.json","fetch_events":"https://pith.science/api/pith-number/YGTHSWTW6DYMD4OPVDZ2OJXDRE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE/action/storage_attestation","attest_author":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE/action/author_attestation","sign_citation":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE/action/citation_signature","submit_replication":"https://pith.science/pith/YGTHSWTW6DYMD4OPVDZ2OJXDRE/action/replication_record"}},"created_at":"2026-06-24T01:15:27.866759+00:00","updated_at":"2026-06-24T01:15:27.866759+00:00"}