{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:PVXVFMUJYSRXGIRBPCL3YSZ6IL","short_pith_number":"pith:PVXVFMUJ","schema_version":"1.0","canonical_sha256":"7d6f52b289c4a37322217897bc4b3e42e200a1a3d6a1ed9bee554c548c192439","source":{"kind":"arxiv","id":"1803.02155","version":2},"attestation_state":"computed","paper":{"title":"Self-Attention with Relative Position Representations","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Ashish Vaswani, Jakob Uszkoreit, Peter Shaw","submitted_at":"2018-03-06T13:13:11Z","abstract_excerpt":"Relying entirely on an attention mechanism, the Transformer introduced by Vaswani et al. (2017) achieves state-of-the-art results for machine translation. In contrast to recurrent and convolutional neural networks, it does not explicitly model relative or absolute position information in its structure. Instead, it requires adding representations of absolute positions to its inputs. In this work we present an alternative approach, extending the self-attention mechanism to efficiently consider representations of the relative positions, or distances between sequence elements. On the WMT 2014 Engl"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1803.02155","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-03-06T13:13:11Z","cross_cats_sorted":[],"title_canon_sha256":"6a9850542dcc60c424491ebcb2cf8ba932ffb3999e724106632ca8c6f5502b0c","abstract_canon_sha256":"18910fc6caf6abb3300bb0c5872ea4e333e120d2d74f4d9e7503d566eab166c8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:18:35.565573Z","signature_b64":"1AyP2PYCVMt4KbewyACpxt7OV2Ga/ULWaRF7D2SlXpxBFi9hkkGqvXRlWm7BrQ96sDhpW7op17+1GiT0uWdMBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"7d6f52b289c4a37322217897bc4b3e42e200a1a3d6a1ed9bee554c548c192439","last_reissued_at":"2026-05-18T00:18:35.565185Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:18:35.565185Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Self-Attention with Relative Position Representations","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Ashish Vaswani, Jakob Uszkoreit, Peter Shaw","submitted_at":"2018-03-06T13:13:11Z","abstract_excerpt":"Relying entirely on an attention mechanism, the Transformer introduced by Vaswani et al. (2017) achieves state-of-the-art results for machine translation. In contrast to recurrent and convolutional neural networks, it does not explicitly model relative or absolute position information in its structure. Instead, it requires adding representations of absolute positions to its inputs. In this work we present an alternative approach, extending the self-attention mechanism to efficiently consider representations of the relative positions, or distances between sequence elements. On the WMT 2014 Engl"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1803.02155","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1803.02155","created_at":"2026-05-18T00:18:35.565240+00:00"},{"alias_kind":"arxiv_version","alias_value":"1803.02155v2","created_at":"2026-05-18T00:18:35.565240+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1803.02155","created_at":"2026-05-18T00:18:35.565240+00:00"},{"alias_kind":"pith_short_12","alias_value":"PVXVFMUJYSRX","created_at":"2026-05-18T12:32:46.962924+00:00"},{"alias_kind":"pith_short_16","alias_value":"PVXVFMUJYSRXGIRB","created_at":"2026-05-18T12:32:46.962924+00:00"},{"alias_kind":"pith_short_8","alias_value":"PVXVFMUJ","created_at":"2026-05-18T12:32:46.962924+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":24,"internal_anchor_count":14,"sample":[{"citing_arxiv_id":"2406.11452","citing_title":"Attention-Based Deep Reinforcement Learning for Qubit Allocation in Modular Quantum Architectures","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2502.12370","citing_title":"Positional Encoding in Transformer-Based Time Series Models: A Survey","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2504.14386","citing_title":"LOOPE: Learnable Optimal Patch Order in Positional Embeddings for Vision Transformers","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2505.12136","citing_title":"Lightweight Spatio-Temporal Attention Network with Graph Embedding and Rotational Position Encoding for Traffic Forecasting","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21081","citing_title":"Musical Attention Transformer: Music Generation Using a Music-Specific Attention Model","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2505.20032","citing_title":"ViTaPEs: Visuotactile Position Encodings for Cross-Modal Alignment in Multimodal Transformers","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2506.09323","citing_title":"Learning-Optimized Qubit Mapping and Reuse to Minimize Inter-Core Communication in Modular Quantum Architectures","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2509.04154","citing_title":"Robust Filter Attention: Self-Attention as Precision-Weighted State Estimation","ref_index":80,"is_internal_anchor":true},{"citing_arxiv_id":"2509.14640","citing_title":"DyWPE: Signal-Aware Dynamic Wavelet Positional Encoding for Time Series Transformers","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2207.14255","citing_title":"Efficient Training of Language Models to Fill in the Middle","ref_index":135,"is_internal_anchor":true},{"citing_arxiv_id":"2511.17388","citing_title":"Selective Rotary Position Embedding","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2512.07805","citing_title":"Group Representational Position Encoding","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2601.16933","citing_title":"Reward-Forcing: Autoregressive Video Generation with Reward Feedback","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2311.16867","citing_title":"The Falcon Series of Open Language Models","ref_index":146,"is_internal_anchor":true},{"citing_arxiv_id":"2510.26692","citing_title":"Kimi Linear: An Expressive, Efficient Attention Architecture","ref_index":89,"is_internal_anchor":false},{"citing_arxiv_id":"2406.00515","citing_title":"A Survey on Large Language Models for Code Generation","ref_index":237,"is_internal_anchor":false},{"citing_arxiv_id":"2204.03458","citing_title":"Video Diffusion Models","ref_index":45,"is_internal_anchor":false},{"citing_arxiv_id":"2604.27559","citing_title":"RIHA: Report-Image Hierarchical Alignment for Radiology Report Generation","ref_index":68,"is_internal_anchor":false},{"citing_arxiv_id":"1910.10683","citing_title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","ref_index":66,"is_internal_anchor":false},{"citing_arxiv_id":"2605.04682","citing_title":"HEXST: Hexagonal Shifted-Window Transformer for Spatial Transcriptomics Gene Expression Prediction","ref_index":20,"is_internal_anchor":false},{"citing_arxiv_id":"2605.04198","citing_title":"Deep Wave Network for Modeling Multi-Scale Physical Dynamics","ref_index":65,"is_internal_anchor":false},{"citing_arxiv_id":"2402.06196","citing_title":"Large Language Models: A Survey","ref_index":126,"is_internal_anchor":false},{"citing_arxiv_id":"2604.18603","citing_title":"Dual Triangle Attention: Effective Bidirectional Attention Without Positional Embeddings","ref_index":23,"is_internal_anchor":false},{"citing_arxiv_id":"2604.20789","citing_title":"Working Memory Constraints Scaffold Learning in Transformers under Data Scarcity","ref_index":38,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL","json":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL.json","graph_json":"https://pith.science/api/pith-number/PVXVFMUJYSRXGIRBPCL3YSZ6IL/graph.json","events_json":"https://pith.science/api/pith-number/PVXVFMUJYSRXGIRBPCL3YSZ6IL/events.json","paper":"https://pith.science/paper/PVXVFMUJ"},"agent_actions":{"view_html":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL","download_json":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL.json","view_paper":"https://pith.science/paper/PVXVFMUJ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1803.02155&json=true","fetch_graph":"https://pith.science/api/pith-number/PVXVFMUJYSRXGIRBPCL3YSZ6IL/graph.json","fetch_events":"https://pith.science/api/pith-number/PVXVFMUJYSRXGIRBPCL3YSZ6IL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL/action/storage_attestation","attest_author":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL/action/author_attestation","sign_citation":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL/action/citation_signature","submit_replication":"https://pith.science/pith/PVXVFMUJYSRXGIRBPCL3YSZ6IL/action/replication_record"}},"created_at":"2026-05-18T00:18:35.565240+00:00","updated_at":"2026-05-18T00:18:35.565240+00:00"}