105/-- **Theorem (Phase 3)**: Among recurrence coefficients (a, b) with
 106a ≥ 1 and b ≥ 1, the pair (1, 1) uniquely minimizes max(a, b).
 107No axiom needed — this is pure arithmetic. -/
 108theorem additive_composition_is_minimal (a b : ℕ) (ha : 1 ≤ a) (hb : 1 ≤ b) :
 109    max a b = 1 → a = 1 ∧ b = 1 := by
 110  intro h
 111  constructor
 112  · exact Nat.le_antisymm (by omega) ha
 113  · exact Nat.le_antisymm (by omega) hb
 114
 115/-- The pair (1,1) achieves max = 1. -/
 116theorem min_max_achieved : max 1 1 = 1 := by simp
 117
 118/-- Any other pair has max ≥ 2. -/
 119theorem other_pairs_larger (a b : ℕ) (ha : 1 ≤ a) (hb : 1 ≤ b)
 120    (h : ¬(a = 1 ∧ b = 1)) : 2 ≤ max a b := by omega
 121
 122/-- Construct the uniform scale ladder from forced data. -/
 123noncomputable def hierarchy_forced
 124    (M : NontrivialMultilevelComposition)
 125    (no_free_scale : ∀ j k,
 126      M.levels (j + 1) / M.levels j = M.levels (k + 1) / M.levels k)
 127    (ratio_gt_one : 1 < M.levels 1 / M.levels 0) :
 128    UniformScaleLadder :=
 129  { levels := M.levels
 130    levels_pos := M.levels_pos
 131    ratio := M.levels 1 / M.levels 0
 132    ratio_gt_one := ratio_gt_one
 133    uniform_scaling := fun k => by
 134      have hk := M.levels_pos k
 135      have h0 := M.levels_pos 0
 136      have hratio := no_free_scale k 0
 137      rw [div_eq_div_iff (ne_of_gt hk) (ne_of_gt h0)] at hratio
 138      field_simp; linarith }

papers checked against this theorem (showing 2 of 2)

20% high-entropy tokens suffice for effective RL on LLM reasoning
echoes · 2506.01939

"utilizing only 20% of the tokens while maintaining performance comparable to full-gradient updates... training exclusively on the 80% lowest-entropy tokens leads to a marked decline"
Activation differences steer LLMs without retraining
unclear · 2308.10248

"ActAdd is lightweight: it does not require any machine optimization and works with a single pair of data points"