diff --git a/Cargo.toml b/Cargo.toml
index 3378dc84bc..316161d9a3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -71,6 +71,7 @@ include = [
     "crypto/fipsmodule/ec/ecp_nistz.h",
     "crypto/fipsmodule/ec/ecp_nistz384.h",
     "crypto/fipsmodule/ec/ecp_nistz384.inl",
+    "crypto/fipsmodule/ec/internal.h",
     "crypto/fipsmodule/ec/gfp_p256.c",
     "crypto/fipsmodule/ec/gfp_p384.c",
     "crypto/fipsmodule/ec/p256.c",
@@ -80,6 +81,7 @@ include = [
     "crypto/fipsmodule/ec/p256_shared.h",
     "crypto/fipsmodule/ec/p256_table.h",
     "crypto/fipsmodule/ec/util.h",
+    "crypto/fipsmodule/ec/wnaf.c",
     "crypto/fipsmodule/ecdsa/ecdsa_verify_tests.txt",
     "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl",
     "crypto/fipsmodule/modes/asm/ghash-armv4.pl",
diff --git a/build.rs b/build.rs
index f7b94108b7..cb9a46922a 100644
--- a/build.rs
+++ b/build.rs
@@ -42,6 +42,7 @@ const RING_SRCS: &[(&[&str], &str)] = &[
     (&[], "crypto/fipsmodule/ec/gfp_p256.c"),
     (&[], "crypto/fipsmodule/ec/gfp_p384.c"),
     (&[], "crypto/fipsmodule/ec/p256.c"),
+    (&[], "crypto/fipsmodule/ec/wnaf.c"),
     (&[], "crypto/limbs/limbs.c"),
     (&[], "crypto/mem.c"),
     (&[], "crypto/poly1305/poly1305.c"),
@@ -903,6 +904,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "LIMBS_reduce_once",
         "LIMBS_select_512_32",
         "LIMBS_shl_mod",
+        "LIMBS_sub_from_assign",
         "LIMBS_sub_mod",
         "LIMBS_window5_split_window",
         "LIMBS_window5_unsplit_window",
@@ -933,6 +935,7 @@ fn prefix_all_symbols(pp: char, prefix_prefix: &str, prefix: &str) -> String {
         "bssl_constant_time_test_main",
         "chacha20_poly1305_open",
         "chacha20_poly1305_seal",
+        "ec_compute_wNAF",
         "fiat_curve25519_adx_mul",
         "fiat_curve25519_adx_square",
         "gcm_ghash_avx",
diff --git a/crypto/fipsmodule/ec/wnaf.c b/crypto/fipsmodule/ec/wnaf.c
new file mode 100644
index 0000000000..d0d7299e00
--- /dev/null
+++ b/crypto/fipsmodule/ec/wnaf.c
@@ -0,0 +1,144 @@
+/* Originally written by Bodo Moeller for the OpenSSL project.
+ * ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the OpenSSL open source
+ * license provided above.
+ *
+ * The elliptic curve binary polynomial software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
+ * Laboratories. */
+
+#include "../bn/internal.h"
+#include "../../internal.h"
+#include "../../limbs/limbs.h"
+
+// Returns bit |bit| of |limbs| as 0 or 1. Bits at or beyond
+// |num_limbs| * LIMB_BITS read as 0.
+static int is_bit_set(const Limb limbs[], size_t num_limbs, size_t bit) {
+    const size_t limb_index = bit / LIMB_BITS;
+    const size_t bit_in_limb = bit % LIMB_BITS;
+    return limb_index < num_limbs
+        ? (int)((limbs[limb_index] >> bit_in_limb) & 1) : 0;
+}
+
+// This file computes the wNAF digit recoding used by the interleaving
+// multi-exponentiation method described at:
+//   http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13
+//   http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf
+
+void ec_compute_wNAF(int8_t *out, const Limb scalar[], size_t scalar_limbs, size_t bits, int w) {
+  // 'int8_t' can represent integers with absolute values less than 2^7.
+  debug_assert_nonsecret(0 < w && w <= 7);
+  debug_assert_nonsecret(bits != 0);
+  int bit = 1 << w;         // 2^w, at most 128
+  int next_bit = bit << 1;  // 2^(w+1), at most 256
+  int mask = next_bit - 1;  // at most 255
+
+  int window_val = ((int)scalar[0]) & mask;
+  for (size_t j = 0; j < bits + 1; j++) {
+    debug_assert_nonsecret(0 <= window_val && window_val <= next_bit);
+    int digit = 0;
+    if (window_val & 1) {
+      debug_assert_nonsecret(0 < window_val && window_val < next_bit);
+      if (window_val & bit) {
+        digit = window_val - next_bit;
+        // We know -next_bit < digit < 0 and window_val - digit = next_bit.
+
+        // modified wNAF
+        if (j + ((size_t)w) + 1 >= bits) {
+          // special case for generating modified wNAFs:
+          // no new bits will be added into window_val,
+          // so using a positive digit here will decrease
+          // the total length of the representation
+
+          digit = window_val & (mask >> 1);
+          // We know 0 < digit < bit and window_val - digit = bit.
+        }
+      } else {
+        digit = window_val;
+        // We know 0 < digit < bit and window_val - digit = 0.
+      }
+
+      window_val -= digit;
+
+      // Now window_val is 0 or 2^(w+1) in standard wNAF generation.
+      // For modified window NAFs, it may also be 2^w.
+      //
+      // See the comments above for the derivation of each of these bounds.
+      debug_assert_nonsecret(window_val == 0 || window_val == next_bit || window_val == bit);
+      debug_assert_nonsecret(-bit < digit && digit < bit);
+
+      // window_val was odd, so digit is also odd.
+      debug_assert_nonsecret(digit & 1);
+    }
+
+    out[j] = (int8_t)digit;
+
+    // Incorporate the next bit. Previously, |window_val| <= |next_bit|, so if
+    // we shift and add at most one copy of |bit|, this will continue to hold
+    // afterwards.
+    window_val >>= 1;
+    window_val += bit * is_bit_set(scalar, scalar_limbs, j + (size_t)w + 1);
+    debug_assert_nonsecret(window_val <= next_bit);
+  }
+
+  // bits + 1 entries should be sufficient to consume all bits.
+  debug_assert_nonsecret(window_val == 0);
+}
diff --git a/crypto/limbs/limbs.c b/crypto/limbs/limbs.c
index df84f0767f..31a44bed98 100644
--- a/crypto/limbs/limbs.c
+++ b/crypto/limbs/limbs.c
@@ -122,6 +122,11 @@ void LIMBS_add_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[],
   }
 }
 
+// r := a - r.
+void LIMBS_sub_from_assign(Limb r[], const Limb a[], size_t num_limbs) {
+    (void)limbs_sub(r, a, r, num_limbs);
+}
+
 void LIMBS_sub_mod(Limb r[], const Limb a[], const Limb b[], const Limb m[],
                    size_t num_limbs) {
   Limb underflow =
diff --git a/src/bits.rs b/src/bits.rs
index 5851aaf593..9bc182ecdf 100644
--- a/src/bits.rs
+++ b/src/bits.rs
@@ -49,7 +49,7 @@ impl BitLength {
 
     /// The number of bits this bit length represents, as a `usize`.
     #[inline]
-    pub fn as_usize_bits(&self) -> usize {
+    pub const fn as_usize_bits(&self) -> usize {
         self.0
     }
 
diff --git a/src/ec/suite_b/ops.rs b/src/ec/suite_b/ops.rs
index cf3c73455c..6a0ea099c1 100644
--- a/src/ec/suite_b/ops.rs
+++ b/src/ec/suite_b/ops.rs
@@ -12,7 +12,12 @@
 // OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
-use crate::{arithmetic::limbs_from_hex, arithmetic::montgomery::*, c, error, limb::*};
+use crate::{
+    arithmetic::{limbs_from_hex, montgomery::*},
+    bits::BitLength,
+    c, error,
+    limb::*,
+};
 use core::marker::PhantomData;
 
 pub use self::elem::*;
@@ -33,6 +38,7 @@ pub type Scalar<E = Unencoded> = elem::Elem<N, E>;
 #[derive(Clone, Copy)]
 pub enum N {}
 
+#[derive(Clone, Copy)]
 pub struct Point {
     // The coordinates are stored in a contiguous array, where the first
     // `ops.num_limbs` elements are the X coordinate, the next
@@ -62,11 +68,15 @@ pub struct CommonOps {
     // In all cases, `r`, `a`, and `b` may all alias each other.
     elem_mul_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb),
     elem_sqr_mont: unsafe extern "C" fn(r: *mut Limb, a: *const Limb),
-
+    point_double_jacobian_impl: unsafe extern "C" fn(r: *mut Limb, a: *const Limb),
     point_add_jacobian_impl: unsafe extern "C" fn(r: *mut Limb, a: *const Limb, b: *const Limb),
 }
 
 impl CommonOps {
+    fn order_bits(&self) -> BitLength {
+        BitLength::from_usize_bits(self.num_limbs * LIMB_BITS)
+    }
+
     #[inline]
     pub fn elem_add<E: Encoding>(&self, a: &mut Elem<E>, b: &Elem<E>) {
         let num_limbs = self.num_limbs;
@@ -128,6 +138,39 @@ impl CommonOps {
         }
     }
 
+    fn point_new_affine(&self, x: &Elem<R>, y: &Elem<R>) -> Point {
+        // `z` is 1 in the Montgomery domain.
+        let z = {
+            let mut acc = Elem::zero();
+            acc.limbs[0] = 1;
+            let mut rr = Elem::zero();
+            rr.limbs[..self.num_limbs].copy_from_slice(&self.q.rr[..self.num_limbs]);
+
+            self.elem_mul(&mut acc, &rr);
+            acc
+        };
+        self.point_new_jacobian(x, y, &z)
+    }
+
+    fn point_new_jacobian(&self, x: &Elem<R>, y: &Elem<R>, z: &Elem<R>) -> Point {
+        let mut r = Point::new_at_infinity();
+        r.xyz[..self.num_limbs].copy_from_slice(&x.limbs[..self.num_limbs]);
+        r.xyz[self.num_limbs..(2 * self.num_limbs)].copy_from_slice(&y.limbs[..self.num_limbs]);
+        r.xyz[(2 * self.num_limbs)..(3 * self.num_limbs)]
+            .copy_from_slice(&z.limbs[..self.num_limbs]);
+        r
+    }
+
+    fn point_double_assign(&self, r: &mut Point) {
+        unsafe { (self.point_double_jacobian_impl)(r.xyz.as_mut_ptr(), r.xyz.as_ptr()) }
+    }
+
+    fn point_add_assign(&self, r: &mut Point, a: &Point) {
+        unsafe {
+            (self.point_add_jacobian_impl)(r.xyz.as_mut_ptr(), r.xyz.as_ptr(), a.xyz.as_ptr())
+        }
+    }
+
     pub fn point_sum(&self, a: &Point, b: &Point) -> Point {
         let mut r = Point::new_at_infinity();
         unsafe {
@@ -136,6 +179,20 @@ impl CommonOps {
         r
     }
 
+    /// Returns `-a`: `y` becomes `q.p - y`, except that a zero `y` is left unchanged.
+    fn point_neg_vartime(&self, a: &Point) -> Point {
+        let n = self.num_limbs;
+        let mut negated = *a;
+        let y = &mut negated.xyz[n..(2 * n)];
+        // TODO(perf): The way this is used, `y` is never zero; none of the
+        // curves we support have a point with y == 0, and the caller never
+        // calls this on the point at infinity.
+        if y.iter().any(|&limb| limb != 0) {
+            limbs_sub_from_assign(y, &self.q.p[..n]);
+        }
+        negated
+    }
+
     pub fn point_x(&self, p: &Point) -> Elem<R> {
         let mut r = Elem::zero();
         r.limbs[..self.num_limbs].copy_from_slice(&p.xyz[0..self.num_limbs]);
@@ -301,19 +358,6 @@ pub struct PrivateScalarOps {
     pub oneRR_mod_n: Scalar<RR>, // 1 * R**2 (mod n). TOOD: Use One<RR>.
 }
 
-// XXX: Inefficient and unnecessarily depends on `PrivateKeyOps`. TODO: implement interleaved wNAF
-// multiplication.
-fn twin_mul_inefficient(
-    ops: &PrivateKeyOps,
-    g_scalar: &Scalar,
-    p_scalar: &Scalar,
-    p_xy: &(Elem<R>, Elem<R>),
-) -> Point {
-    let scaled_g = ops.point_mul_base(g_scalar);
-    let scaled_p = ops.point_mul(p_scalar, p_xy);
-    ops.common.point_sum(&scaled_g, &scaled_p)
-}
-
 // This assumes n < q < 2*n.
 pub fn elem_reduced_to_scalar(ops: &CommonOps, elem: &Elem<Unencoded>) -> Scalar<Unencoded> {
     let num_limbs = ops.num_limbs;
@@ -438,7 +482,7 @@ prefixed_extern! {
 #[cfg(test)]
 mod tests {
     extern crate alloc;
-    use super::*;
+    use super::{vartime::points_mul_vartime, *};
     use crate::test;
     use alloc::{format, vec, vec::Vec};
 
@@ -839,52 +883,28 @@ mod tests {
 
     #[test]
     fn p256_point_double_test() {
-        prefixed_extern! {
-            fn p256_point_double(
-                r: *mut Limb,   // [p256::COMMON_OPS.num_limbs*3]
-                a: *const Limb, // [p256::COMMON_OPS.num_limbs*3]
-            );
-        }
         point_double_test(
             &p256::PRIVATE_KEY_OPS,
-            p256_point_double,
             test_file!("ops/p256_point_double_tests.txt"),
         );
     }
 
     #[test]
     fn p384_point_double_test() {
-        prefixed_extern! {
-            fn p384_point_double(
-                r: *mut Limb,   // [p384::COMMON_OPS.num_limbs*3]
-                a: *const Limb, // [p384::COMMON_OPS.num_limbs*3]
-            );
-        }
         point_double_test(
             &p384::PRIVATE_KEY_OPS,
-            p384_point_double,
             test_file!("ops/p384_point_double_tests.txt"),
         );
     }
 
-    fn point_double_test(
-        ops: &PrivateKeyOps,
-        point_double: unsafe extern "C" fn(
-            r: *mut Limb,   // [ops.num_limbs*3]
-            a: *const Limb, // [ops.num_limbs*3]
-        ),
-        test_file: test::File,
-    ) {
+    fn point_double_test(ops: &PrivateKeyOps, test_file: test::File) {
         test::run(test_file, |section, test_case| {
             assert_eq!(section, "");
 
-            let a = consume_jacobian_point(ops, test_case, "a");
+            let mut r_actual = consume_jacobian_point(ops, test_case, "a");
             let r_expected = consume_point(ops, test_case, "r");
 
-            let mut r_actual = Point::new_at_infinity();
-            unsafe {
-                point_double(r_actual.xyz.as_mut_ptr(), a.xyz.as_ptr());
-            }
+            ops.common.point_double_assign(&mut r_actual);
 
             assert_point_actual_equals_expected(ops, &r_actual, &r_expected);
 
@@ -897,18 +917,73 @@ mod tests {
         point_mul_tests(
             &p256::PRIVATE_KEY_OPS,
             test_file!("ops/p256_point_mul_tests.txt"),
+            |s, p| p256::PRIVATE_KEY_OPS.point_mul(s, p),
+        );
+    }
+
+    #[test]
+    fn p256_point_mul_g_test() {
+        point_mul_tests(
+            &p256::PRIVATE_KEY_OPS,
+            test_file!("ops/p256_point_mul_tests.txt"),
+            |g_scalar, g| {
+                let p_scalar = Scalar::zero();
+                let p = (Elem::zero(), Elem::zero());
+                points_mul_vartime(&p256::COMMON_OPS, g_scalar, g, &p_scalar, &p)
+            },
         );
     }
 
+    #[test]
+    fn p256_point_mul_p_test() {
+        point_mul_tests(
+            &p256::PRIVATE_KEY_OPS,
+            test_file!("ops/p256_point_mul_tests.txt"),
+            |p_scalar, p| {
+                let g_scalar = Scalar::zero();
+                points_mul_vartime(&p256::COMMON_OPS, &g_scalar, &p256::GENERATOR, p_scalar, p)
+            },
+        );
+    }
     #[test]
     fn p384_point_mul_test() {
         point_mul_tests(
             &p384::PRIVATE_KEY_OPS,
             test_file!("ops/p384_point_mul_tests.txt"),
+            |s, p| p384::PRIVATE_KEY_OPS.point_mul(s, p),
+        );
+    }
+
+    #[test]
+    fn p384_point_mul_g_test() {
+        point_mul_tests(
+            &p384::PRIVATE_KEY_OPS,
+            test_file!("ops/p384_point_mul_tests.txt"),
+            |g_scalar, g| {
+                let p_scalar = Scalar::zero();
+                let p = (Elem::zero(), Elem::zero());
+                points_mul_vartime(&p384::COMMON_OPS, g_scalar, g, &p_scalar, &p)
+            },
+        );
+    }
+
+    #[test]
+    fn p384_point_mul_p_test() {
+        point_mul_tests(
+            &p384::PRIVATE_KEY_OPS,
+            test_file!("ops/p384_point_mul_tests.txt"),
+            |s, p| {
+                let g_scalar = Scalar::zero();
+                points_mul_vartime(&p384::COMMON_OPS, &g_scalar, &p384::GENERATOR, s, p)
+            },
         );
     }
 
-    fn point_mul_tests(ops: &PrivateKeyOps, test_file: test::File) {
+    fn point_mul_tests(
+        ops: &PrivateKeyOps,
+        test_file: test::File,
+        point_mul: impl Fn(&Scalar, &(Elem<R>, Elem<R>)) -> Point,
+    ) {
         test::run(test_file, |section, test_case| {
             assert_eq!(section, "");
             let p_scalar = consume_scalar(ops.common, test_case, "p_scalar");
@@ -919,7 +994,7 @@ mod tests {
                 TestPoint::Affine(x, y) => (x, y),
             };
             let expected_result = consume_point(ops, test_case, "r");
-            let actual_result = ops.point_mul(&p_scalar, &(x, y));
+            let actual_result = point_mul(&p_scalar, &(x, y));
             assert_point_actual_equals_expected(ops, &actual_result, &expected_result);
             Ok(())
         })
@@ -1184,3 +1259,4 @@ mod tests {
 mod elem;
 pub mod p256;
 pub mod p384;
+mod vartime;
diff --git a/src/ec/suite_b/ops/elem.rs b/src/ec/suite_b/ops/elem.rs
index d9c424fb28..aeaf6e7e67 100644
--- a/src/ec/suite_b/ops/elem.rs
+++ b/src/ec/suite_b/ops/elem.rs
@@ -17,6 +17,7 @@ use crate::{
         limbs_from_hex,
         montgomery::{Encoding, ProductEncoding},
     },
+    bits::BitLength,
     limb::{Limb, LIMB_BITS},
 };
 use core::marker::PhantomData;
@@ -128,4 +129,5 @@ pub fn unary_op_from_binary_op_assign<M, E: Encoding>(
     unsafe { f(a.limbs.as_mut_ptr(), a.limbs.as_ptr(), a.limbs.as_ptr()) }
 }
 
-pub const MAX_LIMBS: usize = (384 + (LIMB_BITS - 1)) / LIMB_BITS;
+pub const MAX_BITS: BitLength = BitLength::from_usize_bits(384);
+pub const MAX_LIMBS: usize = (MAX_BITS.as_usize_bits() + (LIMB_BITS - 1)) / LIMB_BITS;
diff --git a/src/ec/suite_b/ops/p256.rs b/src/ec/suite_b/ops/p256.rs
index 70b9cbf9fc..11a97591ca 100644
--- a/src/ec/suite_b/ops/p256.rs
+++ b/src/ec/suite_b/ops/p256.rs
@@ -30,10 +30,16 @@ pub static COMMON_OPS: CommonOps = CommonOps {
 
     elem_mul_mont: p256_mul_mont,
     elem_sqr_mont: p256_sqr_mont,
-
+    point_double_jacobian_impl: p256_point_double,
     point_add_jacobian_impl: p256_point_add,
 };
 
+#[cfg(any(test, not(any(target_arch = "aarch64", target_arch = "x86_64"))))]
+pub(super) static GENERATOR: (Elem<R>, Elem<R>) = (
+    Elem::from_hex("18905f76a53755c679fb732b7762251075ba95fc5fedb60179e730d418a9143c"),
+    Elem::from_hex("8571ff1825885d85d2e88688dd21f3258b4ab8e4ba19e45cddf25357ce95560a"),
+);
+
 pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps {
     common: &COMMON_OPS,
     elem_inv_squared: p256_elem_inv_squared,
@@ -120,7 +126,8 @@ pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
 
     #[cfg(not(any(target_arch = "aarch64", target_arch = "x86_64")))]
     twin_mul: |g_scalar, p_scalar, p_xy| {
-        twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy)
+        // TODO: Make use of precomputed multiples of `g` that already exist.
+        vartime::points_mul_vartime(&COMMON_OPS, g_scalar, &GENERATOR, p_scalar, p_xy)
     },
 
     q_minus_n: Elem::from_hex("4319055358e8617b0c46353d039cdaae"),
@@ -293,6 +300,10 @@ prefixed_extern! {
         a: *const Limb, // [3][COMMON_OPS.num_limbs]
         b: *const Limb, // [3][COMMON_OPS.num_limbs]
     );
+    fn p256_point_double(
+         r: *mut Limb,   // [p256::COMMON_OPS.num_limbs*3]
+         a: *const Limb, // [p256::COMMON_OPS.num_limbs*3]
+    );
     fn p256_point_mul(
         r: *mut Limb,          // [3][COMMON_OPS.num_limbs]
         p_scalar: *const Limb, // [COMMON_OPS.num_limbs]
diff --git a/src/ec/suite_b/ops/p384.rs b/src/ec/suite_b/ops/p384.rs
index 54ec00aa8f..467bde68b1 100644
--- a/src/ec/suite_b/ops/p384.rs
+++ b/src/ec/suite_b/ops/p384.rs
@@ -32,10 +32,15 @@ pub static COMMON_OPS: CommonOps = CommonOps {
 ,
     elem_mul_mont: p384_elem_mul_mont,
     elem_sqr_mont: p384_elem_sqr_mont,
-
+    point_double_jacobian_impl: p384_point_double,
     point_add_jacobian_impl: p384_point_add,
 };
 
+pub(super) static GENERATOR: (Elem<R>, Elem<R>) = (
+    Elem::from_hex("4d3aadc2299e1513812ff723614ede2b6454868459a30eff879c3afc541b4d6e20e378e2a0d6ce383dd0756649c0b528"),
+    Elem::from_hex("2b78abc25a15c5e9dd8002263969a840c6c3521968f4ffd98bade7562e83b050a1bfa8bf7bb4a9ac23043dad4b03a4fe"),
+);
+
 pub static PRIVATE_KEY_OPS: PrivateKeyOps = PrivateKeyOps {
     common: &COMMON_OPS,
     elem_inv_squared: p384_elem_inv_squared,
@@ -101,11 +106,6 @@ fn p384_elem_inv_squared(a: &Elem<R>) -> Elem<R> {
 
 fn p384_point_mul_base_impl(a: &Scalar) -> Point {
     // XXX: Not efficient. TODO: Precompute multiples of the generator.
-    const GENERATOR: (Elem<R>, Elem<R>) = (
-        Elem::from_hex("4d3aadc2299e1513812ff723614ede2b6454868459a30eff879c3afc541b4d6e20e378e2a0d6ce383dd0756649c0b528"),
-        Elem::from_hex("2b78abc25a15c5e9dd8002263969a840c6c3521968f4ffd98bade7562e83b050a1bfa8bf7bb4a9ac23043dad4b03a4fe"),
-    );
-
     PRIVATE_KEY_OPS.point_mul(a, &GENERATOR)
 }
 
@@ -123,7 +123,7 @@ pub static PUBLIC_SCALAR_OPS: PublicScalarOps = PublicScalarOps {
     scalar_ops: &SCALAR_OPS,
     public_key_ops: &PUBLIC_KEY_OPS,
     twin_mul: |g_scalar, p_scalar, p_xy| {
-        twin_mul_inefficient(&PRIVATE_KEY_OPS, g_scalar, p_scalar, p_xy)
+        vartime::points_mul_vartime(&COMMON_OPS, g_scalar, &GENERATOR, p_scalar, p_xy)
     },
 
     q_minus_n: Elem::from_hex("389cb27e0bc8d21fa7e5f24cb74f58851313e696333ad68c"),
@@ -291,6 +291,10 @@ prefixed_extern! {
         a: *const Limb, // [3][COMMON_OPS.num_limbs]
         b: *const Limb, // [3][COMMON_OPS.num_limbs]
     );
+    fn p384_point_double(
+        r: *mut Limb,   // [p384::COMMON_OPS.num_limbs*3]
+        a: *const Limb, // [p384::COMMON_OPS.num_limbs*3]
+    );
     fn p384_point_mul(
         r: *mut Limb,          // [3][COMMON_OPS.num_limbs]
         p_scalar: *const Limb, // [COMMON_OPS.num_limbs]
diff --git a/src/ec/suite_b/ops/vartime.rs b/src/ec/suite_b/ops/vartime.rs
new file mode 100644
index 0000000000..b649f8789b
--- /dev/null
+++ b/src/ec/suite_b/ops/vartime.rs
@@ -0,0 +1,136 @@
+// Copyright 2023 Brian Smith.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use super::{CommonOps, Elem, Point, Scalar, MAX_BITS};
+use crate::{arithmetic::montgomery::R, c, limb::Limb};
+
+pub(super) fn points_mul_vartime(
+    ops: &'static CommonOps,
+    g_scalar: &Scalar,
+    g: &(Elem<R>, Elem<R>),
+    p_scalar: &Scalar,
+    p: &(Elem<R>, Elem<R>),
+) -> Point {
+    let mut g_wnaf: [i8; WNAF_MAX_LEN] = [0; WNAF_MAX_LEN];
+    let (g_wnaf, g_precomp) = prepare(ops, g_scalar, g, &mut g_wnaf);
+
+    let mut p_wnaf: [i8; WNAF_MAX_LEN] = [0; WNAF_MAX_LEN];
+    let (p_wnaf, p_precomp) = prepare(ops, p_scalar, p, &mut p_wnaf);
+
+    let mut acc = PointVartime::new_at_infinity(ops);
+    // Iterate from the highest-order digit to the lowest-order digit.
+    g_wnaf
+        .iter()
+        .zip(p_wnaf)
+        .enumerate()
+        .rev()
+        .for_each(|(i, (&g_digit, &p_digit))| {
+            process_digit(ops, &mut acc, g_digit, &g_precomp);
+            process_digit(ops, &mut acc, p_digit, &p_precomp);
+            if i > 0 {
+                acc.double_assign();
+            }
+        });
+    acc.value.unwrap_or_else(Point::new_at_infinity)
+}
+
+const WINDOW_BITS: u32 = 4;
+const WNAF_MAX_LEN: usize = MAX_BITS.as_usize_bits() + 1;
+const PRECOMP_LEN: usize = 1 << (WINDOW_BITS - 1);
+
+fn prepare<'a>(
+    ops: &'static CommonOps,
+    a: &Scalar,
+    (x, y): &(Elem<R>, Elem<R>),
+    wnaf: &'a mut [i8; WNAF_MAX_LEN],
+) -> (&'a [i8], [Point; PRECOMP_LEN]) {
+    let order_bits = ops.order_bits().as_usize_bits();
+    let wnaf = &mut wnaf[..(order_bits + 1)];
+    prefixed_extern! {
+        fn ec_compute_wNAF(out: *mut i8, scalar: *const Limb, scalar_limbs: c::size_t,
+                           order_bits: c::size_t, w: c::int);
+    }
+    unsafe {
+        ec_compute_wNAF(
+            wnaf.as_mut_ptr(),
+            a.limbs.as_ptr(),
+            a.limbs.len(),
+            order_bits,
+            WINDOW_BITS as c::int,
+        );
+    }
+
+    let mut precomp = [Point::new_at_infinity(); PRECOMP_LEN];
+    // Fill `precomp` with `p` and all odd multiples (1 * p, 3 * p, 5 * p, etc.).
+    precomp[0] = ops.point_new_affine(x, y);
+    let mut p2 = precomp[0];
+    ops.point_double_assign(&mut p2);
+    for i in 1..precomp.len() {
+        precomp[i] = ops.point_sum(&p2, &precomp[i - 1]);
+    }
+    (wnaf, precomp)
+}
+
+/// Adds `digit * P` to `acc`, where `precomp[i]` holds `(2 * i + 1) * P`.
+fn process_digit(
+    ops: &CommonOps,
+    acc: &mut PointVartime,
+    digit: i8,
+    precomp: &[Point; PRECOMP_LEN],
+) {
+    // A zero digit contributes nothing to the sum.
+    if digit == 0 {
+        return;
+    }
+    // Nonzero wNAF digits are always odd.
+    debug_assert_eq!(digit & 1, 1);
+    // Odd multiple `m` is stored at index `m >> 1`; `unsigned_abs` cannot overflow.
+    let entry = &precomp[usize::from(digit.unsigned_abs() >> 1)];
+    if digit < 0 {
+        acc.add_assign(&ops.point_neg_vartime(entry));
+    } else {
+        acc.add_assign(entry);
+    }
+}
+
+/// A `Point` with operations optimized for the case where it is the point at
+/// infinity.
+struct PointVartime {
+    ops: &'static CommonOps,
+
+    /// `None` means "definitely the point at infinity." `Some(p)` may or may
+    /// not be the point at infinity. Will be `None` until a nonzero bit of
+    /// the scalar is encountered.
+    value: Option<Point>,
+}
+
+impl PointVartime {
+    /// Starts at the point at infinity, represented as `None`.
+    pub fn new_at_infinity(ops: &'static CommonOps) -> Self {
+        Self { ops, value: None }
+    }
+    /// Doubles in place; a no-op while the value is still `None`.
+    pub fn double_assign(&mut self) {
+        if let Some(p) = self.value.as_mut() {
+            self.ops.point_double_assign(p);
+        }
+    }
+    /// Adds `a` in place; if the value is still `None`, it becomes a copy of `a`.
+    pub fn add_assign(&mut self, a: &Point) {
+        match self.value.as_mut() {
+            Some(sum) => self.ops.point_add_assign(sum, a),
+            None => self.value = Some(*a),
+        }
+    }
+}
diff --git a/src/limb.rs b/src/limb.rs
index 5825101121..53212f120a 100644
--- a/src/limb.rs
+++ b/src/limb.rs
@@ -350,6 +350,20 @@ pub(crate) fn limbs_add_assign_mod(a: &mut [Limb], b: &[Limb], m: &[Limb]) {
     unsafe { LIMBS_add_mod(a.as_mut_ptr(), a.as_ptr(), b.as_ptr(), m.as_ptr(), m.len()) }
 }
 
+/// r := a - r.
+///
+/// `r` and `a` must have the same length; `r.len()` limbs are processed.
+pub(crate) fn limbs_sub_from_assign(r: &mut [Limb], a: &[Limb]) {
+    // Compare the two *different* slices; `r.len()` limbs of `a` are passed to C.
+    debug_assert_eq!(r.len(), a.len());
+    prefixed_extern! {
+        // `r` and `a` may alias.
+        fn LIMBS_sub_from_assign(r: *mut Limb, a: *const Limb, num_limbs: c::size_t);
+    }
+    // SAFETY: `r` spans `r.len()` limbs; so does `a` when the lengths match (see above).
+    unsafe { LIMBS_sub_from_assign(r.as_mut_ptr(), a.as_ptr(), r.len()) }
+}
+
 prefixed_extern! {
     fn LIMBS_are_zero(a: *const Limb, num_limbs: c::size_t) -> LimbMask;
     fn LIMBS_less_than(a: *const Limb, b: *const Limb, num_limbs: c::size_t) -> LimbMask;