{"work":{"id":"7f61d5d0-82cd-471c-9ea6-17c31d56d24e","openalex_id":null,"doi":null,"arxiv_id":"1710.09412","raw_key":null,"title":"mixup: Beyond Empirical Risk Minimization","authors":null,"authors_text":"Hongyi Zhang, Moustapha Cisse, Yann N. Dauphin, David Lopez-Paz","year":2017,"venue":"cs.LG","abstract":"Large deep neural networks are powerful, but exhibit undesirable behaviors such as memorization and sensitivity to adversarial examples. In this work, we propose mixup, a simple learning principle to alleviate these issues. In essence, mixup trains a neural network on convex combinations of pairs of examples and their labels. By doing so, mixup regularizes the neural network to favor simple linear behavior in-between training examples. Our experiments on the ImageNet-2012, CIFAR-10, CIFAR-100, Google commands and UCI datasets show that mixup improves the generalization of state-of-the-art neural network architectures. We also find that mixup reduces the memorization of corrupt labels, increases the robustness to adversarial examples, and stabilizes the training of generative adversarial networks.","external_url":"https://arxiv.org/abs/1710.09412","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T19:31:10.433801+00:00","pith_arxiv_id":"1710.09412","created_at":"2026-05-08T21:09:12.855395+00:00","updated_at":"2026-06-05T21:23:00.469572+00:00","title_quality_ok":true,"display_title":"mixup: Beyond Empirical Risk Minimization","render_title":"mixup: Beyond Empirical Risk Minimization"},"hub":{"state":{"work_id":"7f61d5d0-82cd-471c-9ea6-17c31d56d24e","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":65,"external_cited_by_count":null,"distinct_field_count":9,"first_pith_cited_at":"2019-06-20T08:44:03+00:00","last_pith_cited_at":"2026-05-20T05:24:43+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-11T20:18:51.689334+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":6},{"context_role":"method","n":6},{"context_role":"baseline","n":2},{"context_role":"other","n":1}],"polarity_counts":[{"context_polarity":"background","n":7},{"context_polarity":"use_method","n":6},{"context_polarity":"baseline","n":2}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T18:09:51.165403+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":7},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":7},{"title":"Invariant Risk Minimization","work_id":"d76c6842-b84d-44ec-bcea-b80cd8d07981","shared_citers":5},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":4},{"title":"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift","work_id":"05484516-8937-4cdf-9176-7f8329ef0221","shared_citers":3},{"title":"BEiT: BERT Pre-Training of Image Transformers","work_id":"d74eda3c-bf7e-45f1-a8f1-a0137ecca3f4","shared_citers":3},{"title":"Deep Residual Learning for Image Recognition","work_id":"ae9e5671-23e8-4853-82a4-699b5b8dd639","shared_citers":3},{"title":"Improved Regularization of Convolutional Neural Networks with Cutout","work_id":"a3bf8477-f913-4f6a-8e36-125767300d1f","shared_citers":3},{"title":"arXiv preprint arXiv:1708.04896 (2017)","work_id":"968eeb9d-f5db-4e4c-8bcb-e2c28a6250d1","shared_citers":2},{"title":"arXiv preprint arXiv:1905.04899 (2019)","work_id":"ce71f921-03a9-4994-ba80-2e5f6486559b","shared_citers":2},{"title":"arXiv preprint arXiv:2006.10726 (2020) 2, 3, 12, 13, 36","work_id":"7f343a70-32a2-42dc-945a-66a6feb179fb","shared_citers":2},{"title":"arXiv preprint arXiv:2010.05761 (2020) 4, 8","work_id":"9f465ee9-fbd1-4205-aee5-7ca66f1e7cb9","shared_citers":2},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":2},{"title":"DINOv3","work_id":"c8b07deb-8fe7-4e18-9620-f3569d3529ce","shared_citers":2},{"title":"Distributionally Robust Neural Networks for Group Shifts: On the Importance of Regularization for Worst-Case Generalization","work_id":"b9385d0d-bafd-43d3-8948-4d2da8ee27a0","shared_citers":2},{"title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning","work_id":"fff3953b-5efb-4753-bee4-002f59995810","shared_citers":2},{"title":"Hierarchical Text-Conditional Image Generation with CLIP Latents","work_id":"0c6a768b-70b8-4242-bb0e-459f1008c9fc","shared_citers":2},{"title":"Improved Baselines with Momentum Contrastive Learning","work_id":"f275e715-bcdc-487c-bc31-6f98d8a01f5c","shared_citers":2},{"title":"In: Proceedings of the IEEE/CVF international conference on computer vision","work_id":"abf093d9-2016-4efd-a534-1bfcbcb4a799","shared_citers":2},{"title":"Junnan Li, Richard Socher, and Steven CH Hoi","work_id":"517ccf84-2677-4c02-9951-ed1d3f9aa1c6","shared_citers":2},{"title":"MONAI: An open-source framework for deep learning in healthcare","work_id":"5bb8ee7d-31fa-4bb6-aef2-8b87a6d465b4","shared_citers":2},{"title":"Patchcraft: Exploring texture patch for efficient ai-generated image detection","work_id":"f5883221-d53f-4bba-94d4-bcc918a6f4e9","shared_citers":2},{"title":"Proceedings of the eleventh annual conference on Computational learning theory , pages=","work_id":"9999350e-f57f-46d9-87c6-5e7dddb6b3b9","shared_citers":2},{"title":"Progressive Growing of GANs for Improved Quality, Stability, and Variation","work_id":"5e8c5f57-fe56-4018-ac90-b6b3b849f420","shared_citers":2}],"time_series":[{"n":2,"year":2020},{"n":2,"year":2024},{"n":1,"year":2025},{"n":33,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T18:09:36.206764+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T18:09:40.441523+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"mixup: Beyond Empirical Risk Minimization","claims":[{"claim_text":"Large deep neural networks are powerful, but exhibit undesirable behaviors such as memorization and sensitivity to adversarial examples. In this work, we propose mixup, a simple learning principle to alleviate these issues. In essence, mixup trains a neural network on convex combinations of pairs of examples and their labels. By doing so, mixup regularizes the neural network to favor simple linear behavior in-between training examples. Our experiments on the ImageNet-2012, CIFAR-10, CIFAR-100, Google commands and UCI datasets show that mixup improves the generalization of state-of-the-art neur","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks mixup: Beyond Empirical Risk Minimization because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T18:09:36.154098+00:00"}},"summary":{"title":"mixup: Beyond Empirical Risk Minimization","claims":[{"claim_text":"Large deep neural networks are powerful, but exhibit undesirable behaviors such as memorization and sensitivity to adversarial examples. In this work, we propose mixup, a simple learning principle to alleviate these issues. In essence, mixup trains a neural network on convex combinations of pairs of examples and their labels. By doing so, mixup regularizes the neural network to favor simple linear behavior in-between training examples. Our experiments on the ImageNet-2012, CIFAR-10, CIFAR-100, Google commands and UCI datasets show that mixup improves the generalization of state-of-the-art neur","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks mixup: Beyond Empirical Risk Minimization because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale","work_id":"e96730e3-129b-4db6-b981-15ab7932e297","shared_citers":7},{"title":"Decoupled Weight Decay Regularization","work_id":"07ef7360-d385-4033-83f7-8384a6325204","shared_citers":7},{"title":"Invariant Risk Minimization","work_id":"d76c6842-b84d-44ec-bcea-b80cd8d07981","shared_citers":5},{"title":"Adam: A Method for Stochastic Optimization","work_id":"1910796d-9b52-4683-bf5c-de9632c1028b","shared_citers":4},{"title":"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift","work_id":"05484516-8937-4cdf-9176-7f8329ef0221","shared_citers":3},{"title":"BEiT: BERT Pre-Training of Image Transformers","work_id":"d74eda3c-bf7e-45f1-a8f1-a0137ecca3f4","shared_citers":3},{"title":"Deep Residual Learning for Image Recognition","work_id":"ae9e5671-23e8-4853-82a4-699b5b8dd639","shared_citers":3},{"title":"Improved Regularization of Convolutional Neural Networks with Cutout","work_id":"a3bf8477-f913-4f6a-8e36-125767300d1f","shared_citers":3},{"title":"arXiv preprint arXiv:1708.04896 (2017)","work_id":"968eeb9d-f5db-4e4c-8bcb-e2c28a6250d1","shared_citers":2},{"title":"arXiv preprint arXiv:1905.04899 (2019)","work_id":"ce71f921-03a9-4994-ba80-2e5f6486559b","shared_citers":2},{"title":"arXiv preprint arXiv:2006.10726 (2020) 2, 3, 12, 13, 36","work_id":"7f343a70-32a2-42dc-945a-66a6feb179fb","shared_citers":2},{"title":"arXiv preprint arXiv:2010.05761 (2020) 4, 8","work_id":"9f465ee9-fbd1-4205-aee5-7ca66f1e7cb9","shared_citers":2},{"title":"DINOv2: Learning Robust Visual Features without Supervision","work_id":"26b304e5-b54a-4f26-be7e-83299eca52e4","shared_citers":2},{"title":"DINOv3","work_id":"c8b07deb-8fe7-4e18-9620-f3569d3529ce","shared_citers":2},{"title":"Distributionally Robust Neural Networks for Group Shifts: On the Importance of Regularization for Worst-Case Generalization","work_id":"b9385d0d-bafd-43d3-8948-4d2da8ee27a0","shared_citers":2},{"title":"FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning","work_id":"fff3953b-5efb-4753-bee4-002f59995810","shared_citers":2},{"title":"Hierarchical Text-Conditional Image Generation with CLIP Latents","work_id":"0c6a768b-70b8-4242-bb0e-459f1008c9fc","shared_citers":2},{"title":"Improved Baselines with Momentum Contrastive Learning","work_id":"f275e715-bcdc-487c-bc31-6f98d8a01f5c","shared_citers":2},{"title":"In: Proceedings of the IEEE/CVF international conference on computer vision","work_id":"abf093d9-2016-4efd-a534-1bfcbcb4a799","shared_citers":2},{"title":"Junnan Li, Richard Socher, and Steven CH Hoi","work_id":"517ccf84-2677-4c02-9951-ed1d3f9aa1c6","shared_citers":2},{"title":"MONAI: An open-source framework for deep learning in healthcare","work_id":"5bb8ee7d-31fa-4bb6-aef2-8b87a6d465b4","shared_citers":2},{"title":"Patchcraft: Exploring texture patch for efficient ai-generated image detection","work_id":"f5883221-d53f-4bba-94d4-bcc918a6f4e9","shared_citers":2},{"title":"Proceedings of the eleventh annual conference on Computational learning theory , pages=","work_id":"9999350e-f57f-46d9-87c6-5e7dddb6b3b9","shared_citers":2},{"title":"Progressive Growing of GANs for Improved Quality, Stability, and Variation","work_id":"5e8c5f57-fe56-4018-ac90-b6b3b849f420","shared_citers":2}],"time_series":[{"n":2,"year":2020},{"n":2,"year":2024},{"n":1,"year":2025},{"n":33,"year":2026}],"dependency_candidates":[]},"authors":[]}}