{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:WWMGNKMITKHW6I236PQ5LJCDZA","short_pith_number":"pith:WWMGNKMI","schema_version":"1.0","canonical_sha256":"b59866a9889a8f6f235bf3e1d5a443c8193104354d93b66fd0ae58feba7ea607","source":{"kind":"arxiv","id":"2311.02382","version":2},"attestation_state":"computed","paper":{"title":"Ultra-Long Sequence Distributed Transformer","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.DC","authors_text":"Aristeidis Tsaris, Hong-Jun Yoon, Isaac Lyngaas, John Gouley, Mayanka Chandra Shekar, Mohamed Wahib, Peng Chen, Sajal Dash, Tao Luo, Xiao Wang","submitted_at":"2023-11-04T11:38:53Z","abstract_excerpt":"Transformer models trained on long sequences often achieve higher accuracy than short sequences. Unfortunately, conventional transformers struggle with long sequence training due to the overwhelming computation and memory requirements. Existing methods for long sequence training offer limited speedup and memory reduction, and may compromise accuracy. This paper presents a novel and efficient distributed training method, the Long Short-Sequence Transformer (LSS Transformer), for training transformer with long sequences. It distributes a long sequence into segments among GPUs, with each GPU comp"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2311.02382","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.DC","submitted_at":"2023-11-04T11:38:53Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"c70e186cbc624522207fec85021fcf98c3f8fe79fce51a93de48f231daec130f","abstract_canon_sha256":"a7a0af2cd954f65e8c147b735e4c8df33c9c83bd80c76acd91eba94637b64f56"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-07-05T07:10:30.711593Z","signature_b64":"XnIipsztZ4hfYX6+2MZqGMmNqSYZu5MZiPSN1E3fbC2LGT1YDcHjD/rG5u3mqr25aBue16OFp4etr8r6vxehBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b59866a9889a8f6f235bf3e1d5a443c8193104354d93b66fd0ae58feba7ea607","last_reissued_at":"2026-07-05T07:10:30.711031Z","signature_status":"signed_v1","first_computed_at":"2026-07-05T07:10:30.711031Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Ultra-Long Sequence Distributed Transformer","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.DC","authors_text":"Aristeidis Tsaris, Hong-Jun Yoon, Isaac Lyngaas, John Gouley, Mayanka Chandra Shekar, Mohamed Wahib, Peng Chen, Sajal Dash, Tao Luo, Xiao Wang","submitted_at":"2023-11-04T11:38:53Z","abstract_excerpt":"Transformer models trained on long sequences often achieve higher accuracy than short sequences. Unfortunately, conventional transformers struggle with long sequence training due to the overwhelming computation and memory requirements. Existing methods for long sequence training offer limited speedup and memory reduction, and may compromise accuracy. This paper presents a novel and efficient distributed training method, the Long Short-Sequence Transformer (LSS Transformer), for training transformer with long sequences. It distributes a long sequence into segments among GPUs, with each GPU comp"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2311.02382","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2311.02382/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2311.02382","created_at":"2026-07-05T07:10:30.711088+00:00"},{"alias_kind":"arxiv_version","alias_value":"2311.02382v2","created_at":"2026-07-05T07:10:30.711088+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2311.02382","created_at":"2026-07-05T07:10:30.711088+00:00"},{"alias_kind":"pith_short_12","alias_value":"WWMGNKMITKHW","created_at":"2026-07-05T07:10:30.711088+00:00"},{"alias_kind":"pith_short_16","alias_value":"WWMGNKMITKHW6I23","created_at":"2026-07-05T07:10:30.711088+00:00"},{"alias_kind":"pith_short_8","alias_value":"WWMGNKMI","created_at":"2026-07-05T07:10:30.711088+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2606.30460","citing_title":"HSAP: A Hierarchical Sequence-aware Parallelism for Hybrid-Context Generative Models","ref_index":25,"is_internal_anchor":false},{"citing_arxiv_id":"2606.30460","citing_title":"HSAP: A Hierarchical Sequence-aware Parallelism for Hybrid-Context Generative Models","ref_index":25,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA","json":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA.json","graph_json":"https://pith.science/api/pith-number/WWMGNKMITKHW6I236PQ5LJCDZA/graph.json","events_json":"https://pith.science/api/pith-number/WWMGNKMITKHW6I236PQ5LJCDZA/events.json","paper":"https://pith.science/paper/WWMGNKMI"},"agent_actions":{"view_html":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA","download_json":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA.json","view_paper":"https://pith.science/paper/WWMGNKMI","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2311.02382&json=true","fetch_graph":"https://pith.science/api/pith-number/WWMGNKMITKHW6I236PQ5LJCDZA/graph.json","fetch_events":"https://pith.science/api/pith-number/WWMGNKMITKHW6I236PQ5LJCDZA/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA/action/storage_attestation","attest_author":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA/action/author_attestation","sign_citation":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA/action/citation_signature","submit_replication":"https://pith.science/pith/WWMGNKMITKHW6I236PQ5LJCDZA/action/replication_record"}},"created_at":"2026-07-05T07:10:30.711088+00:00","updated_at":"2026-07-05T07:10:30.711088+00:00"}