{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:FN2ECR4GYERKGOU35ZF6Y22335","short_pith_number":"pith:FN2ECR4G","schema_version":"1.0","canonical_sha256":"2b74414786c122a33a9bee4bec6b5bdf6600847ae320c89e685fee0a95f29332","source":{"kind":"arxiv","id":"2406.16678","version":2},"attestation_state":"computed","paper":{"title":"Segment Any Text: A Universal Approach for Robust, Efficient and Adaptable Sentence Segmentation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Benjamin Minixhofer, Igor Sterner, Ivan Vuli\\'c, Markus Frohmann, Markus Schedl","submitted_at":"2024-06-24T14:36:11Z","abstract_excerpt":"Segmenting text into sentences plays an early and crucial role in many NLP systems. This is commonly achieved by using rule-based or statistical methods relying on lexical features such as punctuation. Although some recent works no longer exclusively rely on punctuation, we find that no prior method achieves all of (i) robustness to missing punctuation, (ii) effective adaptability to new domains, and (iii) high efficiency. We introduce a new model - Segment any Text (SaT) - to solve this problem. To enhance robustness, we propose a new pretraining scheme that ensures less reliance on punctuati"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2406.16678","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-24T14:36:11Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"301c661068821e28702f81c35d4334eb8339d4118a45126deb33fbfc2a3151f1","abstract_canon_sha256":"b168f8144b6f801871c97e99263a3c80ec6e05b1fbb24ef7d7c4e600582b1eef"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T09:14:54.718890Z","signature_b64":"WGR4WQ7PI6fFt/OXR8z86tPqCqkOt1Z/bZKoZFmouHiDWR//lrggwB8cekvV/8Gm11Iti8OjUKz9ch3u6PlLAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2b74414786c122a33a9bee4bec6b5bdf6600847ae320c89e685fee0a95f29332","last_reissued_at":"2026-07-05T09:14:54.718353Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T09:14:54.718353Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Segment Any Text: A Universal Approach for Robust, Efficient and Adaptable Sentence Segmentation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Benjamin Minixhofer, Igor Sterner, Ivan Vuli\\'c, Markus Frohmann, Markus Schedl","submitted_at":"2024-06-24T14:36:11Z","abstract_excerpt":"Segmenting text into sentences plays an early and crucial role in many NLP systems. This is commonly achieved by using rule-based or statistical methods relying on lexical features such as punctuation. Although some recent works no longer exclusively rely on punctuation, we find that no prior method achieves all of (i) robustness to missing punctuation, (ii) effective adaptability to new domains, and (iii) high efficiency. We introduce a new model - Segment any Text (SaT) - to solve this problem. To enhance robustness, we propose a new pretraining scheme that ensures less reliance on punctuati"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2406.16678","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2406.16678/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2406.16678","created_at":"2026-07-05T09:14:54.718416+00:00"},{"alias_kind":"arxiv_version","alias_value":"2406.16678v2","created_at":"2026-07-05T09:14:54.718416+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.16678","created_at":"2026-07-05T09:14:54.718416+00:00"},{"alias_kind":"pith_short_12","alias_value":"FN2ECR4GYERK","created_at":"2026-07-05T09:14:54.718416+00:00"},{"alias_kind":"pith_short_16","alias_value":"FN2ECR4GYERKGOU3","created_at":"2026-07-05T09:14:54.718416+00:00"},{"alias_kind":"pith_short_8","alias_value":"FN2ECR4G","created_at":"2026-07-05T09:14:54.718416+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335","json":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335.json","graph_json":"https://pith.science/api/pith-number/FN2ECR4GYERKGOU35ZF6Y22335/graph.json","events_json":"https://pith.science/api/pith-number/FN2ECR4GYERKGOU35ZF6Y22335/events.json","paper":"https://pith.science/paper/FN2ECR4G"},"agent_actions":{"view_html":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335","download_json":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335.json","view_paper":"https://pith.science/paper/FN2ECR4G","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2406.16678&json=true","fetch_graph":"https://pith.science/api/pith-number/FN2ECR4GYERKGOU35ZF6Y22335/graph.json","fetch_events":"https://pith.science/api/pith-number/FN2ECR4GYERKGOU35ZF6Y22335/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335/action/storage_attestation","attest_author":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335/action/author_attestation","sign_citation":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335/action/citation_signature","submit_replication":"https://pith.science/pith/FN2ECR4GYERKGOU35ZF6Y22335/action/replication_record"}},"created_at":"2026-07-05T09:14:54.718416+00:00","updated_at":"2026-07-05T09:14:54.718416+00:00"}