9  symmetric : ∀ {x}, 0 < x → F x = F x⁻¹
  10  unit0 : F 1 = 0
  11
  12lemma Jcost_unit0 : Jcost 1 = 0 := by
  13  simp [Jcost]
  14
  15lemma Jcost_symm {x : ℝ} (hx : 0 < x) : Jcost x = Jcost x⁻¹ := by
  16  have hx0 : x ≠ 0 := ne_of_gt hx
  17  unfold Jcost
  18  have : (x + x⁻¹) = (x⁻¹ + (x⁻¹)⁻¹) := by
  19    field_simp [hx0]
  20    ring
  21  simp [this]
  22
  23lemma Jcost_eq_sq {x : ℝ} (hx : x ≠ 0) :
  24    Jcost x = (x - 1) ^ 2 / (2 * x) := by
  25  have hx2 : (2 * x) ≠ 0 := mul_ne_zero two_ne_zero hx
  26  have hL : Jcost x * (2 * x) = (x - 1) ^ 2 := by
  27    unfold Jcost
  28    have : ((x + x⁻¹) / 2 - 1) * (2 * x) = (x - 1) ^ 2 := by
  29      field_simp [hx]
  30      ring
  31    simpa using this
  32  have hR : ((x - 1) ^ 2 / (2 * x)) * (2 * x) = (x - 1) ^ 2 := by
  33    field_simp [hx]
  34  refine (mul_right_cancel₀ hx2) ?_;
  35  calc
  36    Jcost x * (2 * x) = (x - 1) ^ 2 := hL
  37    _ = ((x - 1) ^ 2 / (2 * x)) * (2 * x) := by simpa using hR.symm
  38
  39lemma Jcost_nonneg {x : ℝ} (hx : 0 < x) : 0 ≤ Jcost x := by
  40  have hx0 : x ≠ 0 := ne_of_gt hx
  41  have hrepr := Jcost_eq_sq (x := x) hx0
  42  have hnum : 0 ≤ (x - 1) ^ 2 := by exact sq_nonneg (x - 1)

papers checked against this theorem (showing 1 of 1)

Fine-tuning a 175B model needs only a rank-2 update
unclear · 2106.09685

"We use a random Gaussian initialization for A and zero for B, so ΔW = BA is zero at the beginning of training. We then scale ΔW x by α/r."