From eeaead0d2e5d756b2c4aa59e23caf99233123695 Mon Sep 17 00:00:00 2001 From: Efraim Flashner Date: Sun, 24 Dec 2023 12:56:49 +0200 Subject: gnu: rust-ring-0.16: Generate more bundled files. * gnu/packages/crates-io.scm (rust-ring-0.16-sources): New variable. (rust-ring-0.16)[source]: Use rust-ring-0.16-sources. Drop patches and snippet. [arguments]: Remove custom phase generating curve25519 tables. [native-inputs]: Remove field. (rust-rustls-0.20)[native-inputs]: Remove field. * gnu/packages/rust-apps.scm (agate, alfis, maturin, rust-cargo-edit, sniffglue, spotifyd, tealdeer)[native-inputs]: Remove perl. * gnu/packages/patches/rust-ring-0.16-missing-files.patch, gnu/packages/patches/rust-ring-0.16-test-files.patch: Remove files. * gnu/local.mk (dist_patch_DATA): Remove them. Change-Id: I919207b6aacab78602ae18123ab345a34b00863f --- gnu/local.mk | 2 - gnu/packages/crates-io.scm | 213 +- .../patches/rust-ring-0.16-missing-files.patch | 2293 -------------------- .../patches/rust-ring-0.16-test-files.patch | 54 - gnu/packages/rust-apps.scm | 15 +- 5 files changed, 183 insertions(+), 2394 deletions(-) delete mode 100644 gnu/packages/patches/rust-ring-0.16-missing-files.patch delete mode 100644 gnu/packages/patches/rust-ring-0.16-test-files.patch (limited to 'gnu') diff --git a/gnu/local.mk b/gnu/local.mk index 79cb6d8d1b..b8de8c3141 100644 --- a/gnu/local.mk +++ b/gnu/local.mk @@ -2001,8 +2001,6 @@ dist_patch_DATA = \ %D%/packages/patches/rustc-1.54.0-src.patch \ %D%/packages/patches/rust-1.64-fix-riscv64-bootstrap.patch \ %D%/packages/patches/rust-1.70-fix-rustix-build.patch \ - %D%/packages/patches/rust-ring-0.16-missing-files.patch \ - %D%/packages/patches/rust-ring-0.16-test-files.patch \ %D%/packages/patches/rust-ring-0.17-ring-core.patch \ %D%/packages/patches/i3status-rust-enable-unstable-features.patch \ %D%/packages/patches/rust-ndarray-remove-blas-src-dep.patch \ diff --git a/gnu/packages/crates-io.scm b/gnu/packages/crates-io.scm index ba54d7c383..9ed7ddea97 100644 --- a/gnu/packages/crates-io.scm +++ b/gnu/packages/crates-io.scm @@ -64190,33 +64190,187 @@ Digital Signature Algorithm} (ECDSA).") (description "This package provided safe, fast, small crypto using Rust.") (license (list license:isc license:openssl)))) +(define rust-ring-0.16-sources + (let* ((version "0.16.20") + (upstream-source + (origin + (method git-fetch) + (uri (git-reference + (url "https://github.com/briansmith/ring") + (commit "9cc0d45f4d8521f467bb3a621e74b1535e118188"))) + (file-name (git-file-name "rust-ring" version)) + (sha256 + (base32 "1aps05i5308ka03968glnnqr4kdkk2x4ghlg5vrqhl78jm6ivvby"))))) + (origin + (method computed-origin-method) + (file-name (string-append "rust-ring-" version ".tar.gz")) + (sha256 #f) + (uri + (delay + (with-imported-modules '((guix build utils)) + #~(begin + (use-modules (guix build utils)) + (set-path-environment-variable + "PATH" '("bin") + (list #+(canonical-package gzip) + #+(canonical-package tar) + #+perl + #+nasm + #+go + #+clang ; clang-format + #+python2-minimal)) + (setenv "HOME" (getcwd)) + (copy-recursively #+upstream-source + (string-append "ring-" #$version)) + (with-directory-excursion (string-append "ring-" #$version) + (begin + ;; It turns out Guix's nasm works just fine here. + (substitute* "build.rs" + (("./target/tools/nasm") "nasm")) + ;; Files which would be deleted in a snippet: + (delete-file "crypto/curve25519/curve25519_tables.h") + (delete-file "crypto/fipsmodule/ec/ecp_nistz256_table.inl") + ;; Files to be generated in the sources: + (format #t "Generating the missing files ...~%") + (force-output) + (with-directory-excursion "crypto/curve25519" + (with-output-to-file "curve25519_tables.h" + (lambda _ (invoke "python" "make_curve25519_tables.py")))) + (with-directory-excursion "crypto/fipsmodule/ec" + (with-output-to-file "ecp_nistz256_table.inl" + (lambda _ (invoke "go" "run" "make_p256-x86_64-table.go")))) + (format #t "Generating the pregenerated files ...~%") + (force-output) + (mkdir-p "pregenerated/tmp") + + ;; We generate all the files which upstream would normally be + ;; generate by using '(cd pregenerate_asm && cargo clean && + ;; cargo build) ./pregenerate_asm/target/debug/pregenerate_asm' + ;; in order to not include a dependency on cargo when + ;; generating the sources. + (define (prefix script) + (string-append + "pregenerated/" + (string-drop-right + (string-drop script + (string-index-right script #\/)) 3))) + + (for-each + (lambda (script) + (invoke "perl" script "elf" + (string-append (prefix script) "-elf.S")) + (invoke "perl" script "macosx" + (string-append (prefix script) "-macosx.S")) + (invoke "perl" script "nasm" + (string-append + "pregenerated/tmp/" + (string-drop (prefix script) 13) "-nasm.asm"))) + '("crypto/fipsmodule/aes/asm/aesni-x86_64.pl" + "crypto/fipsmodule/aes/asm/vpaes-x86_64.pl" + "crypto/fipsmodule/bn/asm/x86_64-mont.pl" + "crypto/fipsmodule/bn/asm/x86_64-mont5.pl" + "crypto/chacha/asm/chacha-x86_64.pl" + "crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl" + "crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl" + "crypto/fipsmodule/modes/asm/ghash-x86_64.pl" + "crypto/fipsmodule/sha/asm/sha512-x86_64.pl" + "crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl")) + + (invoke "perl" "crypto/fipsmodule/sha/asm/sha512-x86_64.pl" + "elf" "pregenerated/sha256-x86_64-elf.S") + + (invoke "perl" "crypto/fipsmodule/sha/asm/sha512-x86_64.pl" + "macosx" "pregenerated/sha256-x86_64-macosx.S") + + (invoke "perl" "crypto/fipsmodule/sha/asm/sha512-x86_64.pl" + "nasm" "pregenerated/tmp/sha256-x86_64-nasm.asm") + + (for-each + (lambda (script) + (invoke "nasm" "-o" (string-append (prefix script) "obj") + "-f" "win64" "-Xgnu" "-gcv8" script)) + (find-files "pregenerated/tmp" "\\.asm")) + + (for-each + (lambda (script) + (invoke "perl" script "ios64" + (string-append (prefix script) "-ios64.S")) + (invoke "perl" script "linux64" + (string-append (prefix script) "-linux64.S"))) + '("crypto/fipsmodule/aes/asm/aesv8-armx.pl" + "crypto/fipsmodule/modes/asm/ghashv8-armx.pl" + "crypto/fipsmodule/aes/asm/vpaes-armv8.pl" + "crypto/fipsmodule/bn/asm/armv8-mont.pl" + "crypto/chacha/asm/chacha-armv8.pl" + "crypto/fipsmodule/ec/asm/ecp_nistz256-armv8.pl" + "crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl" + "crypto/fipsmodule/sha/asm/sha512-armv8.pl")) + + (invoke "perl" "crypto/fipsmodule/sha/asm/sha512-armv8.pl" + "ios64" "pregenerated/sha256-armv8-ios64.S") + + (invoke "perl" "crypto/fipsmodule/sha/asm/sha512-armv8.pl" + "linux64" "pregenerated/sha256-armv8-linux64.S") + + (for-each + (lambda (script) + (invoke "perl" script "elf" + "-fPIC" "-DOPENSSL_IA32_SSE2" + (string-append (prefix script) "-elf.S")) + (invoke "perl" script "macosx" + "-fPIC" "-DOPENSSL_IA32_SSE2" + (string-append (prefix script) "-macosx.S")) + (invoke "perl" script "win32n" + "-fPIC" "-DOPENSSL_IA32_SSE2" + (string-append + "pregenerated/tmp/" + (string-drop (prefix script) 13) "-win32n.asm"))) + '("crypto/fipsmodule/aes/asm/aesni-x86.pl" + "crypto/fipsmodule/aes/asm/vpaes-x86.pl" + "crypto/fipsmodule/bn/asm/x86-mont.pl" + "crypto/chacha/asm/chacha-x86.pl" + "crypto/fipsmodule/ec/asm/ecp_nistz256-x86.pl" + "crypto/fipsmodule/modes/asm/ghash-x86.pl")) + + (for-each + (lambda (script) + (invoke "nasm" "-o" (string-append (prefix script) "obj") + "-f" "win32" "-Xgnu" "-gcv8" script)) + (find-files "pregenerated/tmp" "-win32n\\.asm")) + + (for-each + (lambda (script) + (invoke "perl" script "ios32" + (string-append (prefix script) "-ios32.S")) + (invoke "perl" script "linux32" + (string-append (prefix script) "-linux32.S"))) + '("crypto/fipsmodule/aes/asm/aesv8-armx.pl" + "crypto/fipsmodule/modes/asm/ghashv8-armx.pl" + "crypto/fipsmodule/aes/asm/bsaes-armv7.pl" + "crypto/fipsmodule/aes/asm/vpaes-armv7.pl" + "crypto/fipsmodule/bn/asm/armv4-mont.pl" + "crypto/chacha/asm/chacha-armv4.pl" + "crypto/fipsmodule/ec/asm/ecp_nistz256-armv4.pl" + "crypto/fipsmodule/modes/asm/ghash-armv4.pl" + "crypto/fipsmodule/sha/asm/sha256-armv4.pl" + "crypto/fipsmodule/sha/asm/sha512-armv4.pl")) + + (format #t "Creating the tarball ...~%") + (force-output) + ;; The other option is to use cargo package --allow-dirty + (with-directory-excursion "../" + (invoke "tar" "czf" #$output + ;; avoid non-determinism in the archive + "--sort=name" "--mtime=@0" + "--owner=root:0" "--group=root:0" + (string-append "ring-" #$version)))))))))))) + (define-public rust-ring-0.16 (package (inherit rust-ring-0.17) (name "rust-ring") (version "0.16.20") - (source - (origin - (method url-fetch) - (uri (crate-uri "ring" version)) - (file-name (string-append name "-" version ".tar.gz")) - (sha256 - (base32 "1z682xp7v38ayq9g9nkbhhfpj6ygralmlx7wdmsfv8rnw99cylrh")) - (patches (search-patches "rust-ring-0.16-missing-files.patch" - "rust-ring-0.16-test-files.patch")) - (modules '((guix build utils))) - (snippet - '(begin - (delete-file-recursively "pregenerated") - ;; Regenerating the curve25519_tables requires python2 and clang-format. - ;; Luckily we've added the script back in the patch. - ;; Rust doesn't provide a clear way to regenerate files located in - ;; source directories, so for now we don't remove the file here. - ;(delete-file "crypto/curve25519/curve25519_tables.h") - ;; Pretend this isn't a relase tarball. - (with-output-to-file ".git" - (lambda _ - (format #t ""))))))) + (source rust-ring-0.16-sources) (arguments `(#:cargo-inputs (("rust-libc" ,rust-libc-0.2) @@ -64229,17 +64383,7 @@ Digital Signature Algorithm} (ECDSA).") ("rust-cc" ,rust-cc-1)) #:cargo-development-inputs (("rust-libc" ,rust-libc-0.2) - ("rust-wasm-bindgen-test" ,rust-wasm-bindgen-test-0.3)) - #:phases - (modify-phases %standard-phases - (add-after 'unpack 'generate-curve25519-tables - (lambda _ - (with-directory-excursion "crypto/curve25519" - (with-output-to-file "curve25519_tables.h" - (lambda _ - (invoke "python" "make_curve25519_tables.py"))))))))) - (native-inputs - (list clang perl python-2)) + ("rust-wasm-bindgen-test" ,rust-wasm-bindgen-test-0.3)))) ;; For a mostly complete list of supported systems see: ;; https://github.com/briansmith/ring/blob/main/.github/workflows/ci.yml#L170 (supported-systems (list "aarch64-linux" "armhf-linux" @@ -67144,8 +67288,7 @@ rustc compiler.") ("rust-env-logger" ,rust-env-logger-0.9) ("rust-log" ,rust-log-0.4) ("rust-rustls-pemfile" ,rust-rustls-pemfile-1) - ("rust-webpki-roots" ,rust-webpki-roots-0.22)))) - (native-inputs (list perl)))) + ("rust-webpki-roots" ,rust-webpki-roots-0.22)))))) (define-public rust-rustls-0.19 (package diff --git a/gnu/packages/patches/rust-ring-0.16-missing-files.patch b/gnu/packages/patches/rust-ring-0.16-missing-files.patch deleted file mode 100644 index fa2f94a801..0000000000 --- a/gnu/packages/patches/rust-ring-0.16-missing-files.patch +++ /dev/null @@ -1,2293 +0,0 @@ -These 4 files exist in the git repository for rust-ring, and are from -the same commit where 0.16.20 is taken from. They were not added to the -include list in Cargo.toml, so they were not added to the tarball. - ---- - crypto/curve25519/make_curve25519_tables.py | 222 +++++ - crypto/fipsmodule/aes/asm/vpaes-armv7.pl | 896 ++++++++++++++++++ - crypto/fipsmodule/aes/asm/vpaes-armv8.pl | 837 ++++++++++++++++ - .../fipsmodule/modes/asm/ghash-neon-armv8.pl | 294 ++++++ - 4 files changed, 2249 insertions(+) - create mode 100755 crypto/curve25519/make_curve25519_tables.py - create mode 100644 crypto/fipsmodule/aes/asm/vpaes-armv7.pl - create mode 100755 crypto/fipsmodule/aes/asm/vpaes-armv8.pl - create mode 100644 crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl - -diff --git a/crypto/curve25519/make_curve25519_tables.py b/crypto/curve25519/make_curve25519_tables.py -new file mode 100755 -index 0000000..50dee2a ---- /dev/null -+++ b/crypto/curve25519/make_curve25519_tables.py -@@ -0,0 +1,222 @@ -+#!/usr/bin/env python -+# coding=utf-8 -+# Copyright (c) 2020, Google Inc. -+# -+# Permission to use, copy, modify, and/or distribute this software for any -+# purpose with or without fee is hereby granted, provided that the above -+# copyright notice and this permission notice appear in all copies. -+# -+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -+ -+import StringIO -+import subprocess -+ -+# Base field Z_p -+p = 2**255 - 19 -+ -+def modp_inv(x): -+ return pow(x, p-2, p) -+ -+# Square root of -1 -+modp_sqrt_m1 = pow(2, (p-1) // 4, p) -+ -+# Compute corresponding x-coordinate, with low bit corresponding to -+# sign, or return None on failure -+def recover_x(y, sign): -+ if y >= p: -+ return None -+ x2 = (y*y-1) * modp_inv(d*y*y+1) -+ if x2 == 0: -+ if sign: -+ return None -+ else: -+ return 0 -+ -+ # Compute square root of x2 -+ x = pow(x2, (p+3) // 8, p) -+ if (x*x - x2) % p != 0: -+ x = x * modp_sqrt_m1 % p -+ if (x*x - x2) % p != 0: -+ return None -+ -+ if (x & 1) != sign: -+ x = p - x -+ return x -+ -+# Curve constant -+d = -121665 * modp_inv(121666) % p -+ -+# Base point -+g_y = 4 * modp_inv(5) % p -+g_x = recover_x(g_y, 0) -+ -+# Points are represented as affine tuples (x, y). -+ -+def point_add(P, Q): -+ x1, y1 = P -+ x2, y2 = Q -+ x3 = ((x1*y2 + y1*x2) * modp_inv(1 + d*x1*x2*y1*y2)) % p -+ y3 = ((y1*y2 + x1*x2) * modp_inv(1 - d*x1*x2*y1*y2)) % p -+ return (x3, y3) -+ -+# Computes Q = s * P -+def point_mul(s, P): -+ Q = (0, 1) # Neutral element -+ while s > 0: -+ if s & 1: -+ Q = point_add(Q, P) -+ P = point_add(P, P) -+ s >>= 1 -+ return Q -+ -+def to_bytes(x): -+ ret = bytearray(32) -+ for i in range(len(ret)): -+ ret[i] = x % 256 -+ x >>= 8 -+ assert x == 0 -+ return ret -+ -+def to_ge_precomp(P): -+ # typedef struct { -+ # fe_loose yplusx; -+ # fe_loose yminusx; -+ # fe_loose xy2d; -+ # } ge_precomp; -+ x, y = P -+ return ((y + x) % p, (y - x) % p, (x * y * 2 * d) % p) -+ -+def to_base_25_5(x): -+ limbs = (26, 25, 26, 25, 26, 25, 26, 25, 26, 25) -+ ret = [] -+ for l in limbs: -+ ret.append(x & ((1<>= l -+ assert x == 0 -+ return ret -+ -+def to_base_51(x): -+ ret = [] -+ for _ in range(5): -+ ret.append(x & ((1<<51) - 1)) -+ x >>= 51 -+ assert x == 0 -+ return ret -+ -+def to_literal(x): -+ ret = "{{\n#if defined(BORINGSSL_CURVE25519_64BIT)\n" -+ ret += ", ".join(map(str, to_base_51(x))) -+ ret += "\n#else\n" -+ ret += ", ".join(map(str, to_base_25_5(x))) -+ ret += "\n#endif\n}}" -+ return ret -+ -+def main(): -+ d2 = (2 * d) % p -+ -+ small_precomp = bytearray() -+ for i in range(1, 16): -+ s = (i&1) | ((i&2) << (64-1)) | ((i&4) << (128-2)) | ((i&8) << (192-3)) -+ P = point_mul(s, (g_x, g_y)) -+ small_precomp += to_bytes(P[0]) -+ small_precomp += to_bytes(P[1]) -+ -+ large_precomp = [] -+ for i in range(32): -+ large_precomp.append([]) -+ for j in range(8): -+ P = point_mul((j + 1) << (i * 8), (g_x, g_y)) -+ large_precomp[-1].append(to_ge_precomp(P)) -+ -+ bi_precomp = [] -+ for i in range(8): -+ P = point_mul(2*i + 1, (g_x, g_y)) -+ bi_precomp.append(to_ge_precomp(P)) -+ -+ -+ buf = StringIO.StringIO() -+ buf.write("""/* Copyright (c) 2020, Google Inc. -+ * -+ * Permission to use, copy, modify, and/or distribute this software for any -+ * purpose with or without fee is hereby granted, provided that the above -+ * copyright notice and this permission notice appear in all copies. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -+ -+// This file is generated from -+// ./make_curve25519_tables.py > curve25519_tables.h -+ -+ -+static const fe d = """) -+ buf.write(to_literal(d)) -+ buf.write("""; -+ -+static const fe sqrtm1 = """) -+ buf.write(to_literal(modp_sqrt_m1)) -+ buf.write("""; -+ -+static const fe d2 = """) -+ buf.write(to_literal(d2)) -+ buf.write("""; -+ -+#if defined(OPENSSL_SMALL) -+ -+// This block of code replaces the standard base-point table with a much smaller -+// one. The standard table is 30,720 bytes while this one is just 960. -+// -+// This table contains 15 pairs of group elements, (x, y), where each field -+// element is serialised with |fe_tobytes|. If |i| is the index of the group -+// element then consider i+1 as a four-bit number: (i₀, i₁, i₂, i₃) (where i₀ -+// is the most significant bit). The value of the group element is then: -+// (i₀×2^192 + i₁×2^128 + i₂×2^64 + i₃)G, where G is the generator. -+static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = {""") -+ for i, b in enumerate(small_precomp): -+ buf.write("0x%02x, " % b) -+ buf.write(""" -+}; -+ -+#else -+ -+// k25519Precomp[i][j] = (j+1)*256^i*B -+static const ge_precomp k25519Precomp[32][8] = { -+""") -+ for child in large_precomp: -+ buf.write("{\n") -+ for val in child: -+ buf.write("{\n") -+ for term in val: -+ buf.write(to_literal(term) + ",\n") -+ buf.write("},\n") -+ buf.write("},\n") -+ buf.write("""}; -+ -+#endif // OPENSSL_SMALL -+ -+// Bi[i] = (2*i+1)*B -+static const ge_precomp Bi[8] = { -+""") -+ for val in bi_precomp: -+ buf.write("{\n") -+ for term in val: -+ buf.write(to_literal(term) + ",\n") -+ buf.write("},\n") -+ buf.write("""}; -+""") -+ -+ proc = subprocess.Popen(["clang-format"], stdin=subprocess.PIPE) -+ proc.communicate(buf.getvalue()) -+ -+if __name__ == "__main__": -+ main() -diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl -new file mode 100644 -index 0000000..d36a97a ---- /dev/null -+++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl -@@ -0,0 +1,896 @@ -+#! /usr/bin/env perl -+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. -+# -+# Licensed under the OpenSSL license (the "License"). You may not use -+# this file except in compliance with the License. You can obtain a copy -+# in the file LICENSE in the source distribution or at -+# https://www.openssl.org/source/license.html -+ -+ -+###################################################################### -+## Constant-time SSSE3 AES core implementation. -+## version 0.1 -+## -+## By Mike Hamburg (Stanford University), 2009 -+## Public domain. -+## -+## For details see http://shiftleft.org/papers/vector_aes/ and -+## http://crypto.stanford.edu/vpaes/. -+## -+###################################################################### -+# Adapted from the original x86_64 version and 's ARMv8 -+# version. -+# -+# armv7, aarch64, and x86_64 differ in several ways: -+# -+# * x86_64 SSSE3 instructions are two-address (destination operand is also a -+# source), while NEON is three-address (destination operand is separate from -+# two sources). -+# -+# * aarch64 has 32 SIMD registers available, while x86_64 and armv7 have 16. -+# -+# * x86_64 instructions can take memory references, while ARM is a load/store -+# architecture. This means we sometimes need a spare register. -+# -+# * aarch64 and x86_64 have 128-bit byte shuffle instructions (tbl and pshufb), -+# while armv7 only has a 64-bit byte shuffle (vtbl). -+# -+# This means this armv7 version must be a mix of both aarch64 and x86_64 -+# implementations. armv7 and aarch64 have analogous SIMD instructions, so we -+# base the instructions on aarch64. However, we cannot use aarch64's register -+# allocation. x86_64's register count matches, but x86_64 is two-address. -+# vpaes-armv8.pl already accounts for this in the comments, which use -+# three-address AVX instructions instead of the original SSSE3 ones. We base -+# register usage on these comments, which are preserved in this file. -+# -+# This means we do not use separate input and output registers as in aarch64 and -+# cannot pin as many constants in the preheat functions. However, the load/store -+# architecture means we must still deviate from x86_64 in places. -+# -+# Next, we account for the byte shuffle instructions. vtbl takes 64-bit source -+# and destination and 128-bit table. Fortunately, armv7 also allows addressing -+# upper and lower halves of each 128-bit register. The lower half of q{N} is -+# d{2*N}. The upper half is d{2*N+1}. Instead of the following non-existent -+# instruction, -+# -+# vtbl.8 q0, q1, q2 @ Index each of q2's 16 bytes into q1. Store in q0. -+# -+# we write: -+# -+# vtbl.8 d0, q1, d4 @ Index each of d4's 8 bytes into q1. Store in d0. -+# vtbl.8 d1, q1, d5 @ Index each of d5's 8 bytes into q1. Store in d1. -+# -+# For readability, we write d0 and d1 as q0#lo and q0#hi, respectively and -+# post-process before outputting. (This is adapted from ghash-armv4.pl.) Note, -+# however, that destination (q0) and table (q1) registers may no longer match. -+# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the -+# two-address pshufb always matched these operands, so this is common.) -+# -+# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR -+# expands to an ADD or SUB of the pc register to find an address. That immediate -+# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation. -+# This means larger values must be more aligned. -+# -+# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may -+# use either encoding (do we actually need to support this?). In ARM mode, the -+# distances get large enough to require 16-byte alignment. Moving constants -+# closer to their use resolves most of this, but common constants in -+# _vpaes_consts are used by the whole file. Affected ADR instructions must be -+# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this -+# constraint have been commented. -+# -+# For details on ARM's immediate value encoding scheme, see -+# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/ -+# -+# Finally, a summary of armv7 and aarch64 SIMD syntax differences: -+# -+# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not. -+# -+# * armv7 SIMD registers are named like q0 (and d0 for the half-width ones). -+# aarch64 names registers like v0, and denotes half-width operations in an -+# instruction suffix (see below). -+# -+# * aarch64 embeds size and lane information in register suffixes. v0.16b is -+# 16 bytes, v0.8h is eight u16s, v0.4s is four u32s, and v0.2d is two u64s. -+# armv7 embeds the total size in the register name (see above) and the size of -+# each element in an instruction suffix, which may look like vmov.i8, -+# vshr.u8, or vtbl.8, depending on instruction. -+ -+use strict; -+ -+my $flavour = shift; -+my $output; -+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} -+ -+$0 =~ m/(.*[\/\\])[^\/\\]+$/; -+my $dir=$1; -+my $xlate; -+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or -+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or -+die "can't locate arm-xlate.pl"; -+ -+open OUT,"| \"$^X\" $xlate $flavour $output"; -+*STDOUT=*OUT; -+ -+my $code = ""; -+ -+$code.=<<___; -+.syntax unified -+ -+.arch armv7-a -+.fpu neon -+ -+#if defined(__thumb2__) -+.thumb -+#else -+.code 32 -+#endif -+ -+.text -+ -+.type _vpaes_consts,%object -+.align 7 @ totally strategic alignment -+_vpaes_consts: -+.Lk_mc_forward: @ mc_forward -+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D -+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605 -+.Lk_mc_backward:@ mc_backward -+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B -+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407 -+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F -+.Lk_sr: @ sr -+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08 -+ .quad 0x0F060D040B020900, 0x070E050C030A0108 -+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 -+ -+@ -+@ "Hot" constants -+@ -+.Lk_inv: @ inv, inva -+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02 -+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809 -+.Lk_ipt: @ input transform (lo, hi) -+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 -+.Lk_sbo: @ sbou, sbot -+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA -+.Lk_sb1: @ sb1u, sb1t -+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -+.Lk_sb2: @ sb2u, sb2t -+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD -+ -+.asciz "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)" -+.size _vpaes_consts,.-_vpaes_consts -+.align 6 -+___ -+ -+{ -+my ($inp,$out,$key) = map("r$_", (0..2)); -+ -+my ($invlo,$invhi) = map("q$_", (10..11)); -+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("q$_", (12..15)); -+ -+$code.=<<___; -+@@ -+@@ _aes_preheat -+@@ -+@@ Fills q9-q15 as specified below. -+@@ -+.type _vpaes_preheat,%function -+.align 4 -+_vpaes_preheat: -+ adr r10, .Lk_inv -+ vmov.i8 q9, #0x0f @ .Lk_s0F -+ vld1.64 {q10,q11}, [r10]! @ .Lk_inv -+ add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo -+ vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 -+ vld1.64 {q14,q15}, [r10] @ .Lk_sb2 -+ bx lr -+ -+@@ -+@@ _aes_encrypt_core -+@@ -+@@ AES-encrypt q0. -+@@ -+@@ Inputs: -+@@ q0 = input -+@@ q9-q15 as in _vpaes_preheat -+@@ [$key] = scheduled keys -+@@ -+@@ Output in q0 -+@@ Clobbers q1-q5, r8-r11 -+@@ Preserves q6-q8 so you get some local vectors -+@@ -+@@ -+.type _vpaes_encrypt_core,%function -+.align 4 -+_vpaes_encrypt_core: -+ mov r9, $key -+ ldr r8, [$key,#240] @ pull rounds -+ adr r11, .Lk_ipt -+ @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo -+ @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi -+ vld1.64 {q2, q3}, [r11] -+ adr r11, .Lk_mc_forward+16 -+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key -+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 -+ vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 -+ vtbl.8 q1#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm1 -+ vtbl.8 q1#hi, {q2}, q1#hi -+ vtbl.8 q2#lo, {q3}, q0#lo @ vpshufb %xmm0, %xmm3, %xmm2 -+ vtbl.8 q2#hi, {q3}, q0#hi -+ veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 -+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 -+ -+ @ .Lenc_entry ends with a bnz instruction which is normally paired with -+ @ subs in .Lenc_loop. -+ tst r8, r8 -+ b .Lenc_entry -+ -+.align 4 -+.Lenc_loop: -+ @ middle of middle round -+ add r10, r11, #0x40 -+ vtbl.8 q4#lo, {$sb1t}, q2#lo @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u -+ vtbl.8 q4#hi, {$sb1t}, q2#hi -+ vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] -+ vtbl.8 q0#lo, {$sb1u}, q3#lo @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t -+ vtbl.8 q0#hi, {$sb1u}, q3#hi -+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k -+ vtbl.8 q5#lo, {$sb2t}, q2#lo @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u -+ vtbl.8 q5#hi, {$sb2t}, q2#hi -+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A -+ vtbl.8 q2#lo, {$sb2u}, q3#lo @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t -+ vtbl.8 q2#hi, {$sb2u}, q3#hi -+ vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] -+ vtbl.8 q3#lo, {q0}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B -+ vtbl.8 q3#hi, {q0}, q1#hi -+ veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A -+ @ Write to q5 instead of q0, so the table and destination registers do -+ @ not overlap. -+ vtbl.8 q5#lo, {q0}, q4#lo @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D -+ vtbl.8 q5#hi, {q0}, q4#hi -+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B -+ vtbl.8 q4#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C -+ vtbl.8 q4#hi, {q3}, q1#hi -+ @ Here we restore the original q0/q5 usage. -+ veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D -+ and r11, r11, #~(1<<6) @ and \$0x30, %r11 # ... mod 4 -+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D -+ subs r8, r8, #1 @ nr-- -+ -+.Lenc_entry: -+ @ top of round -+ vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k -+ vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i -+ vtbl.8 q5#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k -+ vtbl.8 q5#hi, {$invhi}, q1#hi -+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j -+ vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i -+ vtbl.8 q3#hi, {$invlo}, q0#hi -+ vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j -+ vtbl.8 q4#hi, {$invlo}, q1#hi -+ veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k -+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k -+ vtbl.8 q2#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak -+ vtbl.8 q2#hi, {$invlo}, q3#hi -+ vtbl.8 q3#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak -+ vtbl.8 q3#hi, {$invlo}, q4#hi -+ veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io -+ veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo -+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 -+ bne .Lenc_loop -+ -+ @ middle of last round -+ add r10, r11, #0x80 -+ -+ adr r11, .Lk_sbo -+ @ Read to q1 instead of q4, so the vtbl.8 instruction below does not -+ @ overlap table and destination registers. -+ vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou -+ vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 -+ vtbl.8 q4#lo, {q1}, q2#lo @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou -+ vtbl.8 q4#hi, {q1}, q2#hi -+ vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] -+ @ Write to q2 instead of q0 below, to avoid overlapping table and -+ @ destination registers. -+ vtbl.8 q2#lo, {q0}, q3#lo @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t -+ vtbl.8 q2#hi, {q0}, q3#hi -+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k -+ veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A -+ @ Here we restore the original q0/q2 usage. -+ vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 -+ vtbl.8 q0#hi, {q2}, q1#hi -+ bx lr -+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core -+ -+.globl GFp_vpaes_encrypt -+.type GFp_vpaes_encrypt,%function -+.align 4 -+GFp_vpaes_encrypt: -+ @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack -+ @ alignment. -+ stmdb sp!, {r7-r11,lr} -+ @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. -+ vstmdb sp!, {d8-d11} -+ -+ vld1.64 {q0}, [$inp] -+ bl _vpaes_preheat -+ bl _vpaes_encrypt_core -+ vst1.64 {q0}, [$out] -+ -+ vldmia sp!, {d8-d11} -+ ldmia sp!, {r7-r11, pc} @ return -+.size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt -+___ -+} -+{ -+my ($inp,$bits,$out,$dir)=("r0","r1","r2","r3"); -+my ($rcon,$s0F,$invlo,$invhi,$s63) = map("q$_",(8..12)); -+ -+$code.=<<___; -+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -+@@ @@ -+@@ AES key schedule @@ -+@@ @@ -+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -+ -+@ This function diverges from both x86_64 and armv7 in which constants are -+@ pinned. x86_64 has a common preheat function for all operations. aarch64 -+@ separates them because it has enough registers to pin nearly all constants. -+@ armv7 does not have enough registers, but needing explicit loads and stores -+@ also complicates using x86_64's register allocation directly. -+@ -+@ We pin some constants for convenience and leave q14 and q15 free to load -+@ others on demand. -+ -+@ -+@ Key schedule constants -+@ -+.type _vpaes_key_consts,%object -+.align 4 -+_vpaes_key_consts: -+.Lk_rcon: @ rcon -+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 -+ -+.Lk_opt: @ output transform -+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 -+.Lk_deskew: @ deskew tables: inverts the sbox's "skew" -+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 -+.size _vpaes_key_consts,.-_vpaes_key_consts -+ -+.type _vpaes_key_preheat,%function -+.align 4 -+_vpaes_key_preheat: -+ adr r11, .Lk_rcon -+ vmov.i8 $s63, #0x5b @ .Lk_s63 -+ adr r10, .Lk_inv @ Must be aligned to 8 mod 16. -+ vmov.i8 $s0F, #0x0f @ .Lk_s0F -+ vld1.64 {$invlo,$invhi}, [r10] @ .Lk_inv -+ vld1.64 {$rcon}, [r11] @ .Lk_rcon -+ bx lr -+.size _vpaes_key_preheat,.-_vpaes_key_preheat -+ -+.type _vpaes_schedule_core,%function -+.align 4 -+_vpaes_schedule_core: -+ @ We only need to save lr, but ARM requires an 8-byte stack alignment, -+ @ so save an extra register. -+ stmdb sp!, {r3,lr} -+ -+ bl _vpaes_key_preheat @ load the tables -+ -+ adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. -+ vld1.64 {q0}, [$inp]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) -+ -+ @ input transform -+ @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not -+ @ overlap table and destination. -+ vmov q4, q0 @ vmovdqa %xmm0, %xmm3 -+ bl _vpaes_schedule_transform -+ adr r10, .Lk_sr @ Must be aligned to 8 mod 16. -+ vmov q7, q0 @ vmovdqa %xmm0, %xmm7 -+ -+ add r8, r8, r10 -+ -+ @ encrypting, output zeroth round key after transform -+ vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) -+ -+ @ *ring*: Decryption removed. -+ -+.Lschedule_go: -+ cmp $bits, #192 @ cmp \$192, %esi -+ bhi .Lschedule_256 -+ @ 128: fall though -+ -+@@ -+@@ .schedule_128 -+@@ -+@@ 128-bit specific part of key schedule. -+@@ -+@@ This schedule is really simple, because all its parts -+@@ are accomplished by the subroutines. -+@@ -+.Lschedule_128: -+ mov $inp, #10 @ mov \$10, %esi -+ -+.Loop_schedule_128: -+ bl _vpaes_schedule_round -+ subs $inp, $inp, #1 @ dec %esi -+ beq .Lschedule_mangle_last -+ bl _vpaes_schedule_mangle @ write output -+ b .Loop_schedule_128 -+ -+@@ -+@@ .aes_schedule_256 -+@@ -+@@ 256-bit specific part of key schedule. -+@@ -+@@ The structure here is very similar to the 128-bit -+@@ schedule, but with an additional "low side" in -+@@ q6. The low side's rounds are the same as the -+@@ high side's, except no rcon and no rotation. -+@@ -+.align 4 -+.Lschedule_256: -+ vld1.64 {q0}, [$inp] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) -+ bl _vpaes_schedule_transform @ input transform -+ mov $inp, #7 @ mov \$7, %esi -+ -+.Loop_schedule_256: -+ bl _vpaes_schedule_mangle @ output low result -+ vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 -+ -+ @ high round -+ bl _vpaes_schedule_round -+ subs $inp, $inp, #1 @ dec %esi -+ beq .Lschedule_mangle_last -+ bl _vpaes_schedule_mangle -+ -+ @ low round. swap xmm7 and xmm6 -+ vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 -+ vmov.i8 q4, #0 -+ vmov q5, q7 @ vmovdqa %xmm7, %xmm5 -+ vmov q7, q6 @ vmovdqa %xmm6, %xmm7 -+ bl _vpaes_schedule_low_round -+ vmov q7, q5 @ vmovdqa %xmm5, %xmm7 -+ -+ b .Loop_schedule_256 -+ -+@@ -+@@ .aes_schedule_mangle_last -+@@ -+@@ Mangler for last round of key schedule -+@@ Mangles q0 -+@@ when encrypting, outputs out(q0) ^ 63 -+@@ when decrypting, outputs unskew(q0) -+@@ -+@@ Always called right before return... jumps to cleanup and exits -+@@ -+.align 4 -+.Lschedule_mangle_last: -+ @ schedule last round key from xmm0 -+ adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew -+ -+ @ encrypting -+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 -+ adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform -+ add $out, $out, #32 @ add \$32, %rdx -+ vmov q2, q0 -+ vtbl.8 q0#lo, {q2}, q1#lo @ vpshufb %xmm1, %xmm0, %xmm0 # output permute -+ vtbl.8 q0#hi, {q2}, q1#hi -+ -+.Lschedule_mangle_last_dec: -+ sub $out, $out, #16 @ add \$-16, %rdx -+ veor q0, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 -+ bl _vpaes_schedule_transform @ output transform -+ vst1.64 {q0}, [$out] @ vmovdqu %xmm0, (%rdx) # save last key -+ -+ @ cleanup -+ veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 -+ veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 -+ veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 -+ veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 -+ veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 -+ veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 -+ veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 -+ veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 -+ ldmia sp!, {r3,pc} @ return -+.size _vpaes_schedule_core,.-_vpaes_schedule_core -+ -+@@ -+@@ .aes_schedule_round -+@@ -+@@ Runs one main round of the key schedule on q0, q7 -+@@ -+@@ Specifically, runs subbytes on the high dword of q0 -+@@ then rotates it by one byte and xors into the low dword of -+@@ q7. -+@@ -+@@ Adds rcon from low byte of q8, then rotates q8 for -+@@ next rcon. -+@@ -+@@ Smears the dwords of q7 by xoring the low into the -+@@ second low, result into third, result into highest. -+@@ -+@@ Returns results in q7 = q0. -+@@ Clobbers q1-q4, r11. -+@@ -+.type _vpaes_schedule_round,%function -+.align 4 -+_vpaes_schedule_round: -+ @ extract rcon from xmm8 -+ vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 -+ vext.8 q1, $rcon, q4, #15 @ vpalignr \$15, %xmm8, %xmm4, %xmm1 -+ vext.8 $rcon, $rcon, $rcon, #15 @ vpalignr \$15, %xmm8, %xmm8, %xmm8 -+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 -+ -+ @ rotate -+ vdup.32 q0, q0#hi[1] @ vpshufd \$0xFF, %xmm0, %xmm0 -+ vext.8 q0, q0, q0, #1 @ vpalignr \$1, %xmm0, %xmm0, %xmm0 -+ -+ @ fall through... -+ -+ @ low round: same as high round, but no rotation and no rcon. -+_vpaes_schedule_low_round: -+ @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. -+ @ We pin other values in _vpaes_key_preheat, so load them now. -+ adr r11, .Lk_sb1 -+ vld1.64 {q14,q15}, [r11] -+ -+ @ smear xmm7 -+ vext.8 q1, q4, q7, #12 @ vpslldq \$4, %xmm7, %xmm1 -+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 -+ vext.8 q4, q4, q7, #8 @ vpslldq \$8, %xmm7, %xmm4 -+ -+ @ subbytes -+ vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 # 0 = k -+ vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 # 1 = i -+ veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 -+ vtbl.8 q2#lo, {$invhi}, q1#lo @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k -+ vtbl.8 q2#hi, {$invhi}, q1#hi -+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j -+ vtbl.8 q3#lo, {$invlo}, q0#lo @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i -+ vtbl.8 q3#hi, {$invlo}, q0#hi -+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k -+ vtbl.8 q4#lo, {$invlo}, q1#lo @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j -+ vtbl.8 q4#hi, {$invlo}, q1#hi -+ veor q7, q7, $s63 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 -+ vtbl.8 q3#lo, {$invlo}, q3#lo @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak -+ vtbl.8 q3#hi, {$invlo}, q3#hi -+ veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k -+ vtbl.8 q2#lo, {$invlo}, q4#lo @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak -+ vtbl.8 q2#hi, {$invlo}, q4#hi -+ veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io -+ veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo -+ vtbl.8 q4#lo, {q15}, q3#lo @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou -+ vtbl.8 q4#hi, {q15}, q3#hi -+ vtbl.8 q1#lo, {q14}, q2#lo @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t -+ vtbl.8 q1#hi, {q14}, q2#hi -+ veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output -+ -+ @ add in smeared stuff -+ veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 -+ veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 -+ bx lr -+.size _vpaes_schedule_round,.-_vpaes_schedule_round -+ -+@@ -+@@ .aes_schedule_transform -+@@ -+@@ Linear-transform q0 according to tables at [r11] -+@@ -+@@ Requires that q9 = 0x0F0F... as in preheat -+@@ Output in q0 -+@@ Clobbers q1, q2, q14, q15 -+@@ -+.type _vpaes_schedule_transform,%function -+.align 4 -+_vpaes_schedule_transform: -+ vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo -+ @ vmovdqa 16(%r11), %xmm1 # hi -+ vand q1, q0, $s0F @ vpand %xmm9, %xmm0, %xmm1 -+ vshr.u8 q0, q0, #4 @ vpsrlb \$4, %xmm0, %xmm0 -+ vtbl.8 q2#lo, {q14}, q1#lo @ vpshufb %xmm1, %xmm2, %xmm2 -+ vtbl.8 q2#hi, {q14}, q1#hi -+ vtbl.8 q0#lo, {q15}, q0#lo @ vpshufb %xmm0, %xmm1, %xmm0 -+ vtbl.8 q0#hi, {q15}, q0#hi -+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 -+ bx lr -+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform -+ -+@@ -+@@ .aes_schedule_mangle -+@@ -+@@ Mangles q0 from (basis-transformed) standard version -+@@ to our version. -+@@ -+@@ On encrypt, -+@@ xor with 0x63 -+@@ multiply by circulant 0,1,1,1 -+@@ apply shiftrows transform -+@@ -+@@ On decrypt, -+@@ xor with 0x63 -+@@ multiply by "inverse mixcolumns" circulant E,B,D,9 -+@@ deskew -+@@ apply shiftrows transform -+@@ -+@@ -+@@ Writes out to [r2], and increments or decrements it -+@@ Keeps track of round number mod 4 in r8 -+@@ Preserves q0 -+@@ Clobbers q1-q5 -+@@ -+.type _vpaes_schedule_mangle,%function -+.align 4 -+_vpaes_schedule_mangle: -+ tst $dir, $dir -+ vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later -+ adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. -+ vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 -+ -+ @ encrypting -+ @ Write to q2 so we do not overlap table and destination below. -+ veor q2, q0, $s63 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 -+ add $out, $out, #16 @ add \$16, %rdx -+ vtbl.8 q4#lo, {q2}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm4 -+ vtbl.8 q4#hi, {q2}, q5#hi -+ vtbl.8 q1#lo, {q4}, q5#lo @ vpshufb %xmm5, %xmm4, %xmm1 -+ vtbl.8 q1#hi, {q4}, q5#hi -+ vtbl.8 q3#lo, {q1}, q5#lo @ vpshufb %xmm5, %xmm1, %xmm3 -+ vtbl.8 q3#hi, {q1}, q5#hi -+ veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 -+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 -+ veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 -+ -+.Lschedule_mangle_both: -+ @ Write to q2 so table and destination do not overlap. -+ vtbl.8 q2#lo, {q3}, q1#lo @ vpshufb %xmm1, %xmm3, %xmm3 -+ vtbl.8 q2#hi, {q3}, q1#hi -+ add r8, r8, #64-16 @ add \$-16, %r8 -+ and r8, r8, #~(1<<6) @ and \$0x30, %r8 -+ vst1.64 {q2}, [$out] @ vmovdqu %xmm3, (%rdx) -+ bx lr -+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle -+ -+.globl GFp_vpaes_set_encrypt_key -+.type GFp_vpaes_set_encrypt_key,%function -+.align 4 -+GFp_vpaes_set_encrypt_key: -+ stmdb sp!, {r7-r11, lr} -+ vstmdb sp!, {d8-d15} -+ -+ lsr r9, $bits, #5 @ shr \$5,%eax -+ add r9, r9, #5 @ \$5,%eax -+ str r9, [$out,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; -+ -+ mov $dir, #0 @ mov \$0,%ecx -+ mov r8, #0x30 @ mov \$0x30,%r8d -+ bl _vpaes_schedule_core -+ eor r0, r0, r0 -+ -+ vldmia sp!, {d8-d15} -+ ldmia sp!, {r7-r11, pc} @ return -+.size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key -+___ -+} -+ -+{ -+my ($out, $inp) = map("r$_", (0..1)); -+my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12)); -+ -+$code .= <<___; -+ -+@ Additional constants for converting to bsaes. -+.type _vpaes_convert_consts,%object -+.align 4 -+_vpaes_convert_consts: -+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear -+@ transform in the AES S-box. 0x63 is incorporated into the low half of the -+@ table. This was computed with the following script: -+@ -+@ def u64s_to_u128(x, y): -+@ return x | (y << 64) -+@ def u128_to_u64s(w): -+@ return w & ((1<<64)-1), w >> 64 -+@ def get_byte(w, i): -+@ return (w >> (i*8)) & 0xff -+@ def apply_table(table, b): -+@ lo = b & 0xf -+@ hi = b >> 4 -+@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) -+@ def opt(b): -+@ table = [ -+@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), -+@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), -+@ ] -+@ return apply_table(table, b) -+@ def rot_byte(b, n): -+@ return 0xff & ((b << n) | (b >> (8-n))) -+@ def skew(x): -+@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ -+@ rot_byte(x, 4)) -+@ table = [0, 0] -+@ for i in range(16): -+@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) -+@ table[1] |= skew(opt(i<<4)) << (i*8) -+@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0])) -+@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1])) -+.Lk_opt_then_skew: -+ .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b -+ .quad 0x1f30062936192f00, 0xb49bad829db284ab -+ -+@ void GFp_vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); -+.globl GFp_vpaes_encrypt_key_to_bsaes -+.type GFp_vpaes_encrypt_key_to_bsaes,%function -+.align 4 -+GFp_vpaes_encrypt_key_to_bsaes: -+ stmdb sp!, {r11, lr} -+ -+ @ See _vpaes_schedule_core for the key schedule logic. In particular, -+ @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), -+ @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last -+ @ contain the transformations not in the bsaes representation. This -+ @ function inverts those transforms. -+ @ -+ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key -+ @ representation, which does not match the other aes_nohw_* -+ @ implementations. The ARM aes_nohw_* stores each 32-bit word -+ @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the -+ @ cost of extra REV and VREV32 operations in little-endian ARM. -+ -+ vmov.i8 $s0F, #0x0f @ Required by _vpaes_schedule_transform -+ adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. -+ add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) -+ -+ vld1.64 {$mc_forward}, [r2] -+ vmov.i8 $s63, #0x5b @ .Lk_s63 from vpaes-x86_64 -+ adr r11, .Lk_opt @ Must be aligned to 8 mod 16. -+ vmov.i8 $s63_raw, #0x63 @ .LK_s63 without .Lk_ipt applied -+ -+ @ vpaes stores one fewer round count than bsaes, but the number of keys -+ @ is the same. -+ ldr r2, [$inp,#240] -+ add r2, r2, #1 -+ str r2, [$out,#240] -+ -+ @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). -+ @ Invert this with .Lk_opt. -+ vld1.64 {q0}, [$inp]! -+ bl _vpaes_schedule_transform -+ vrev32.8 q0, q0 -+ vst1.64 {q0}, [$out]! -+ -+ @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, -+ @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, -+ @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. -+.Loop_enc_key_to_bsaes: -+ vld1.64 {q0}, [$inp]! -+ -+ @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle -+ @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. -+ @ We use r3 rather than r8 to avoid a callee-saved register. -+ vld1.64 {q1}, [r3] -+ vtbl.8 q2#lo, {q0}, q1#lo -+ vtbl.8 q2#hi, {q0}, q1#hi -+ add r3, r3, #16 -+ and r3, r3, #~(1<<6) -+ vmov q0, q2 -+ -+ @ Handle the last key differently. -+ subs r2, r2, #1 -+ beq .Loop_enc_key_to_bsaes_last -+ -+ @ Multiply by the circulant. This is its own inverse. -+ vtbl.8 q1#lo, {q0}, $mc_forward#lo -+ vtbl.8 q1#hi, {q0}, $mc_forward#hi -+ vmov q0, q1 -+ vtbl.8 q2#lo, {q1}, $mc_forward#lo -+ vtbl.8 q2#hi, {q1}, $mc_forward#hi -+ veor q0, q0, q2 -+ vtbl.8 q1#lo, {q2}, $mc_forward#lo -+ vtbl.8 q1#hi, {q2}, $mc_forward#hi -+ veor q0, q0, q1 -+ -+ @ XOR and finish. -+ veor q0, q0, $s63 -+ bl _vpaes_schedule_transform -+ vrev32.8 q0, q0 -+ vst1.64 {q0}, [$out]! -+ b .Loop_enc_key_to_bsaes -+ -+.Loop_enc_key_to_bsaes_last: -+ @ The final key does not have a basis transform (note -+ @ .Lschedule_mangle_last inverts the original transform). It only XORs -+ @ 0x63 and applies ShiftRows. The latter was already inverted in the -+ @ loop. Note that, because we act on the original representation, we use -+ @ $s63_raw, not $s63. -+ veor q0, q0, $s63_raw -+ vrev32.8 q0, q0 -+ vst1.64 {q0}, [$out] -+ -+ @ Wipe registers which contained key material. -+ veor q0, q0, q0 -+ veor q1, q1, q1 -+ veor q2, q2, q2 -+ -+ ldmia sp!, {r11, pc} @ return -+.size GFp_vpaes_encrypt_key_to_bsaes,.-GFp_vpaes_encrypt_key_to_bsaes -+___ -+} -+ -+{ -+# Register-passed parameters. -+my ($inp, $out, $len, $key) = map("r$_", 0..3); -+# Temporaries. _vpaes_encrypt_core already uses r8..r11, so overlap $ivec and -+# $tmp. $ctr is r7 because it must be preserved across calls. -+my ($ctr, $ivec, $tmp) = map("r$_", 7..9); -+ -+# void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, -+# const AES_KEY *key, const uint8_t ivec[16]); -+$code .= <<___; -+.globl GFp_vpaes_ctr32_encrypt_blocks -+.type GFp_vpaes_ctr32_encrypt_blocks,%function -+.align 4 -+GFp_vpaes_ctr32_encrypt_blocks: -+ mov ip, sp -+ stmdb sp!, {r7-r11, lr} -+ @ This function uses q4-q7 (d8-d15), which are callee-saved. -+ vstmdb sp!, {d8-d15} -+ -+ cmp $len, #0 -+ @ $ivec is passed on the stack. -+ ldr $ivec, [ip] -+ beq .Lctr32_done -+ -+ @ _vpaes_encrypt_core expects the key in r2, so swap $len and $key. -+ mov $tmp, $key -+ mov $key, $len -+ mov $len, $tmp -+___ -+my ($len, $key) = ($key, $len); -+$code .= <<___; -+ -+ @ Load the IV and counter portion. -+ ldr $ctr, [$ivec, #12] -+ vld1.8 {q7}, [$ivec] -+ -+ bl _vpaes_preheat -+ rev $ctr, $ctr @ The counter is big-endian. -+ -+.Lctr32_loop: -+ vmov q0, q7 -+ vld1.8 {q6}, [$inp]! @ Load input ahead of time -+ bl _vpaes_encrypt_core -+ veor q0, q0, q6 @ XOR input and result -+ vst1.8 {q0}, [$out]! -+ subs $len, $len, #1 -+ @ Update the counter. -+ add $ctr, $ctr, #1 -+ rev $tmp, $ctr -+ vmov.32 q7#hi[1], $tmp -+ bne .Lctr32_loop -+ -+.Lctr32_done: -+ vldmia sp!, {d8-d15} -+ ldmia sp!, {r7-r11, pc} @ return -+.size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks -+___ -+} -+ -+foreach (split("\n",$code)) { -+ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; -+ print $_,"\n"; -+} -+ -+close STDOUT; -diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv8.pl b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl -new file mode 100755 -index 0000000..b31bbb8 ---- /dev/null -+++ b/crypto/fipsmodule/aes/asm/vpaes-armv8.pl -@@ -0,0 +1,837 @@ -+#! /usr/bin/env perl -+# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. -+# -+# Licensed under the OpenSSL license (the "License"). You may not use -+# this file except in compliance with the License. You can obtain a copy -+# in the file LICENSE in the source distribution or at -+# https://www.openssl.org/source/license.html -+ -+ -+###################################################################### -+## Constant-time SSSE3 AES core implementation. -+## version 0.1 -+## -+## By Mike Hamburg (Stanford University), 2009 -+## Public domain. -+## -+## For details see http://shiftleft.org/papers/vector_aes/ and -+## http://crypto.stanford.edu/vpaes/. -+## -+###################################################################### -+# ARMv8 NEON adaptation by -+# -+# Reason for undertaken effort is that there is at least one popular -+# SoC based on Cortex-A53 that doesn't have crypto extensions. -+# -+# CBC enc ECB enc/dec(*) [bit-sliced enc/dec] -+# Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ] -+# Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ] -+# X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ] -+# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] -+# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] -+# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ] -+# -+# (*) ECB denotes approximate result for parallelizable modes -+# such as CBC decrypt, CTR, etc.; -+# (**) these results are worse than scalar compiler-generated -+# code, but it's constant-time and therefore preferred; -+# (***) presented for reference/comparison purposes; -+ -+$flavour = shift; -+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} -+ -+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or -+( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or -+die "can't locate arm-xlate.pl"; -+ -+open OUT,"| \"$^X\" $xlate $flavour $output"; -+*STDOUT=*OUT; -+ -+$code.=<<___; -+#include -+ -+.section .rodata -+ -+.type _vpaes_consts,%object -+.align 7 // totally strategic alignment -+_vpaes_consts: -+.Lk_mc_forward: // mc_forward -+ .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -+ .quad 0x080B0A0904070605, 0x000302010C0F0E0D -+ .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -+ .quad 0x000302010C0F0E0D, 0x080B0A0904070605 -+.Lk_mc_backward:// mc_backward -+ .quad 0x0605040702010003, 0x0E0D0C0F0A09080B -+ .quad 0x020100030E0D0C0F, 0x0A09080B06050407 -+ .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -+ .quad 0x0A09080B06050407, 0x020100030E0D0C0F -+.Lk_sr: // sr -+ .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -+ .quad 0x030E09040F0A0500, 0x0B06010C07020D08 -+ .quad 0x0F060D040B020900, 0x070E050C030A0108 -+ .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 -+ -+// -+// "Hot" constants -+// -+.Lk_inv: // inv, inva -+ .quad 0x0E05060F0D080180, 0x040703090A0B0C02 -+ .quad 0x01040A060F0B0780, 0x030D0E0C02050809 -+.Lk_ipt: // input transform (lo, hi) -+ .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -+ .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 -+.Lk_sbo: // sbou, sbot -+ .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -+ .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA -+.Lk_sb1: // sb1u, sb1t -+ .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -+ .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -+.Lk_sb2: // sb2u, sb2t -+ .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -+ .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD -+ -+// -+// Key schedule constants -+// -+.Lk_dksd: // decryption key schedule: invskew x*D -+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -+.Lk_dksb: // decryption key schedule: invskew x*B -+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -+.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 -+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -+.Lk_dks9: // decryption key schedule: invskew x*9 -+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE -+ -+.Lk_rcon: // rcon -+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 -+ -+.Lk_opt: // output transform -+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 -+.Lk_deskew: // deskew tables: inverts the sbox's "skew" -+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 -+ -+.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)" -+.size _vpaes_consts,.-_vpaes_consts -+.align 6 -+ -+.text -+___ -+ -+{ -+my ($inp,$out,$key) = map("x$_",(0..2)); -+ -+my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); -+my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); -+my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31)); -+ -+$code.=<<___; -+## -+## _aes_preheat -+## -+## Fills register %r10 -> .aes_consts (so you can -fPIC) -+## and %xmm9-%xmm15 as specified below. -+## -+.type _vpaes_encrypt_preheat,%function -+.align 4 -+_vpaes_encrypt_preheat: -+ adrp x10, :pg_hi21:.Lk_inv -+ add x10, x10, :lo12:.Lk_inv -+ movi v17.16b, #0x0f -+ ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv -+ ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo -+ ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 -+ ret -+.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat -+ -+## -+## _aes_encrypt_core -+## -+## AES-encrypt %xmm0. -+## -+## Inputs: -+## %xmm0 = input -+## %xmm9-%xmm15 as in _vpaes_preheat -+## (%rdx) = scheduled keys -+## -+## Output in %xmm0 -+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax -+## Preserves %xmm6 - %xmm8 so you get some local vectors -+## -+## -+.type _vpaes_encrypt_core,%function -+.align 4 -+_vpaes_encrypt_core: -+ mov x9, $key -+ ldr w8, [$key,#240] // pull rounds -+ adrp x11, :pg_hi21:.Lk_mc_forward+16 -+ add x11, x11, :lo12:.Lk_mc_forward+16 -+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo -+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key -+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 -+ ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 -+ tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 -+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi -+ tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 -+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 -+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 -+ b .Lenc_entry -+ -+.align 4 -+.Lenc_loop: -+ // middle of middle round -+ add x10, x11, #0x40 -+ tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u -+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] -+ tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t -+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k -+ tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u -+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A -+ tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t -+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] -+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B -+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A -+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D -+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B -+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C -+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D -+ and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4 -+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D -+ sub w8, w8, #1 // nr-- -+ -+.Lenc_entry: -+ // top of round -+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k -+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i -+ tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k -+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j -+ tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i -+ tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j -+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k -+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k -+ tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak -+ tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak -+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io -+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo -+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 -+ cbnz w8, .Lenc_loop -+ -+ // middle of last round -+ add x10, x11, #0x80 -+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo -+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 -+ tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou -+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] -+ tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t -+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k -+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A -+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 -+ ret -+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core -+ -+.globl GFp_vpaes_encrypt -+.type GFp_vpaes_encrypt,%function -+.align 4 -+GFp_vpaes_encrypt: -+ AARCH64_SIGN_LINK_REGISTER -+ stp x29,x30,[sp,#-16]! -+ add x29,sp,#0 -+ -+ ld1 {v7.16b}, [$inp] -+ bl _vpaes_encrypt_preheat -+ bl _vpaes_encrypt_core -+ st1 {v0.16b}, [$out] -+ -+ ldp x29,x30,[sp],#16 -+ AARCH64_VALIDATE_LINK_REGISTER -+ ret -+.size GFp_vpaes_encrypt,.-GFp_vpaes_encrypt -+ -+.type _vpaes_encrypt_2x,%function -+.align 4 -+_vpaes_encrypt_2x: -+ mov x9, $key -+ ldr w8, [$key,#240] // pull rounds -+ adrp x11, :pg_hi21:.Lk_mc_forward+16 -+ add x11, x11, :lo12:.Lk_mc_forward+16 -+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo -+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key -+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 -+ ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 -+ and v9.16b, v15.16b, v17.16b -+ ushr v8.16b, v15.16b, #4 -+ tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 -+ tbl v9.16b, {$iptlo}, v9.16b -+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi -+ tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 -+ tbl v10.16b, {$ipthi}, v8.16b -+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 -+ eor v8.16b, v9.16b, v16.16b -+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 -+ eor v8.16b, v8.16b, v10.16b -+ b .Lenc_2x_entry -+ -+.align 4 -+.Lenc_2x_loop: -+ // middle of middle round -+ add x10, x11, #0x40 -+ tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u -+ tbl v12.16b, {$sb1t}, v10.16b -+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] -+ tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t -+ tbl v8.16b, {$sb1u}, v11.16b -+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k -+ eor v12.16b, v12.16b, v16.16b -+ tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u -+ tbl v13.16b, {$sb2t}, v10.16b -+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A -+ eor v8.16b, v8.16b, v12.16b -+ tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t -+ tbl v10.16b, {$sb2u}, v11.16b -+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] -+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B -+ tbl v11.16b, {v8.16b}, v1.16b -+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A -+ eor v10.16b, v10.16b, v13.16b -+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D -+ tbl v8.16b, {v8.16b}, v4.16b -+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B -+ eor v11.16b, v11.16b, v10.16b -+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C -+ tbl v12.16b, {v11.16b},v1.16b -+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D -+ eor v8.16b, v8.16b, v11.16b -+ and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4 -+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D -+ eor v8.16b, v8.16b, v12.16b -+ sub w8, w8, #1 // nr-- -+ -+.Lenc_2x_entry: -+ // top of round -+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k -+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i -+ and v9.16b, v8.16b, v17.16b -+ ushr v8.16b, v8.16b, #4 -+ tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k -+ tbl v13.16b, {$invhi},v9.16b -+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j -+ eor v9.16b, v9.16b, v8.16b -+ tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i -+ tbl v11.16b, {$invlo},v8.16b -+ tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j -+ tbl v12.16b, {$invlo},v9.16b -+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k -+ eor v11.16b, v11.16b, v13.16b -+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k -+ eor v12.16b, v12.16b, v13.16b -+ tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak -+ tbl v10.16b, {$invlo},v11.16b -+ tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak -+ tbl v11.16b, {$invlo},v12.16b -+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io -+ eor v10.16b, v10.16b, v9.16b -+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo -+ eor v11.16b, v11.16b, v8.16b -+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 -+ cbnz w8, .Lenc_2x_loop -+ -+ // middle of last round -+ add x10, x11, #0x80 -+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo -+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 -+ tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou -+ tbl v12.16b, {$sbou}, v10.16b -+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] -+ tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t -+ tbl v8.16b, {$sbot}, v11.16b -+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k -+ eor v12.16b, v12.16b, v16.16b -+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A -+ eor v8.16b, v8.16b, v12.16b -+ tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 -+ tbl v1.16b, {v8.16b},v1.16b -+ ret -+.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x -+___ -+} -+{ -+my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); -+my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); -+ -+$code.=<<___; -+######################################################## -+## ## -+## AES key schedule ## -+## ## -+######################################################## -+.type _vpaes_key_preheat,%function -+.align 4 -+_vpaes_key_preheat: -+ adrp x10, :pg_hi21:.Lk_inv -+ add x10, x10, :lo12:.Lk_inv -+ movi v16.16b, #0x5b // .Lk_s63 -+ adrp x11, :pg_hi21:.Lk_sb1 -+ add x11, x11, :lo12:.Lk_sb1 -+ movi v17.16b, #0x0f // .Lk_s0F -+ ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt -+ adrp x10, :pg_hi21:.Lk_dksd -+ add x10, x10, :lo12:.Lk_dksd -+ ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 -+ adrp x11, :pg_hi21:.Lk_mc_forward -+ add x11, x11, :lo12:.Lk_mc_forward -+ ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb -+ ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 -+ ld1 {v8.2d}, [x10] // .Lk_rcon -+ ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] -+ ret -+.size _vpaes_key_preheat,.-_vpaes_key_preheat -+ -+.type _vpaes_schedule_core,%function -+.align 4 -+_vpaes_schedule_core: -+ AARCH64_SIGN_LINK_REGISTER -+ stp x29, x30, [sp,#-16]! -+ add x29,sp,#0 -+ -+ bl _vpaes_key_preheat // load the tables -+ -+ ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) -+ -+ // input transform -+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 -+ bl _vpaes_schedule_transform -+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 -+ -+ adrp x10, :pg_hi21:.Lk_sr // lea .Lk_sr(%rip),%r10 -+ add x10, x10, :lo12:.Lk_sr -+ -+ add x8, x8, x10 -+ -+ // encrypting, output zeroth round key after transform -+ st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) -+ -+ cmp $bits, #192 // cmp \$192, %esi -+ b.hi .Lschedule_256 -+ b.eq .Lschedule_192 -+ // 128: fall though -+ -+## -+## .schedule_128 -+## -+## 128-bit specific part of key schedule. -+## -+## This schedule is really simple, because all its parts -+## are accomplished by the subroutines. -+## -+.Lschedule_128: -+ mov $inp, #10 // mov \$10, %esi -+ -+.Loop_schedule_128: -+ sub $inp, $inp, #1 // dec %esi -+ bl _vpaes_schedule_round -+ cbz $inp, .Lschedule_mangle_last -+ bl _vpaes_schedule_mangle // write output -+ b .Loop_schedule_128 -+ -+## -+## .aes_schedule_192 -+## -+## 192-bit specific part of key schedule. -+## -+## The main body of this schedule is the same as the 128-bit -+## schedule, but with more smearing. The long, high side is -+## stored in %xmm7 as before, and the short, low side is in -+## the high bits of %xmm6. -+## -+## This schedule is somewhat nastier, however, because each -+## round produces 192 bits of key material, or 1.5 round keys. -+## Therefore, on each cycle we do 2 rounds and produce 3 round -+## keys. -+## -+.align 4 -+.Lschedule_192: -+ sub $inp, $inp, #8 -+ ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) -+ bl _vpaes_schedule_transform // input transform -+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part -+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 -+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros -+ mov $inp, #4 // mov \$4, %esi -+ -+.Loop_schedule_192: -+ sub $inp, $inp, #1 // dec %esi -+ bl _vpaes_schedule_round -+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 -+ bl _vpaes_schedule_mangle // save key n -+ bl _vpaes_schedule_192_smear -+ bl _vpaes_schedule_mangle // save key n+1 -+ bl _vpaes_schedule_round -+ cbz $inp, .Lschedule_mangle_last -+ bl _vpaes_schedule_mangle // save key n+2 -+ bl _vpaes_schedule_192_smear -+ b .Loop_schedule_192 -+ -+## -+## .aes_schedule_256 -+## -+## 256-bit specific part of key schedule. -+## -+## The structure here is very similar to the 128-bit -+## schedule, but with an additional "low side" in -+## %xmm6. The low side's rounds are the same as the -+## high side's, except no rcon and no rotation. -+## -+.align 4 -+.Lschedule_256: -+ ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) -+ bl _vpaes_schedule_transform // input transform -+ mov $inp, #7 // mov \$7, %esi -+ -+.Loop_schedule_256: -+ sub $inp, $inp, #1 // dec %esi -+ bl _vpaes_schedule_mangle // output low result -+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 -+ -+ // high round -+ bl _vpaes_schedule_round -+ cbz $inp, .Lschedule_mangle_last -+ bl _vpaes_schedule_mangle -+ -+ // low round. swap xmm7 and xmm6 -+ dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 -+ movi v4.16b, #0 -+ mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 -+ mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 -+ bl _vpaes_schedule_low_round -+ mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 -+ -+ b .Loop_schedule_256 -+ -+## -+## .aes_schedule_mangle_last -+## -+## Mangler for last round of key schedule -+## Mangles %xmm0 -+## when encrypting, outputs out(%xmm0) ^ 63 -+## when decrypting, outputs unskew(%xmm0) -+## -+## Always called right before return... jumps to cleanup and exits -+## -+.align 4 -+.Lschedule_mangle_last: -+ // schedule last round key from xmm0 -+ adrp x11, :pg_hi21:.Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew -+ add x11, x11, :lo12:.Lk_deskew -+ -+ cbnz $dir, .Lschedule_mangle_last_dec -+ -+ // encrypting -+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 -+ adrp x11, :pg_hi21:.Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform -+ add x11, x11, :lo12:.Lk_opt -+ add $out, $out, #32 // add \$32, %rdx -+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute -+ -+.Lschedule_mangle_last_dec: -+ ld1 {v20.2d-v21.2d}, [x11] // reload constants -+ sub $out, $out, #16 // add \$-16, %rdx -+ eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 -+ bl _vpaes_schedule_transform // output transform -+ st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key -+ -+ // cleanup -+ eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 -+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 -+ eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 -+ eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 -+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 -+ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 -+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 -+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 -+ ldp x29, x30, [sp],#16 -+ AARCH64_VALIDATE_LINK_REGISTER -+ ret -+.size _vpaes_schedule_core,.-_vpaes_schedule_core -+ -+## -+## .aes_schedule_192_smear -+## -+## Smear the short, low side in the 192-bit key schedule. -+## -+## Inputs: -+## %xmm7: high side, b a x y -+## %xmm6: low side, d c 0 0 -+## %xmm13: 0 -+## -+## Outputs: -+## %xmm6: b+c+d b+c 0 0 -+## %xmm0: b+c+d b+c b a -+## -+.type _vpaes_schedule_192_smear,%function -+.align 4 -+_vpaes_schedule_192_smear: -+ movi v1.16b, #0 -+ dup v0.4s, v7.s[3] -+ ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 -+ ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a -+ eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 -+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 -+ eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a -+ mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 -+ ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros -+ ret -+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -+ -+## -+## .aes_schedule_round -+## -+## Runs one main round of the key schedule on %xmm0, %xmm7 -+## -+## Specifically, runs subbytes on the high dword of %xmm0 -+## then rotates it by one byte and xors into the low dword of -+## %xmm7. -+## -+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -+## next rcon. -+## -+## Smears the dwords of %xmm7 by xoring the low into the -+## second low, result into third, result into highest. -+## -+## Returns results in %xmm7 = %xmm0. -+## Clobbers %xmm1-%xmm4, %r11. -+## -+.type _vpaes_schedule_round,%function -+.align 4 -+_vpaes_schedule_round: -+ // extract rcon from xmm8 -+ movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 -+ ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1 -+ ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8 -+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 -+ -+ // rotate -+ dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0 -+ ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0 -+ -+ // fall through... -+ -+ // low round: same as high round, but no rotation and no rcon. -+_vpaes_schedule_low_round: -+ // smear xmm7 -+ ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1 -+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 -+ ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4 -+ -+ // subbytes -+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k -+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i -+ eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 -+ tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k -+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j -+ tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i -+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k -+ tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j -+ eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 -+ tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak -+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k -+ tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak -+ eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io -+ eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo -+ tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou -+ tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t -+ eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output -+ -+ // add in smeared stuff -+ eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 -+ eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 -+ ret -+.size _vpaes_schedule_round,.-_vpaes_schedule_round -+ -+## -+## .aes_schedule_transform -+## -+## Linear-transform %xmm0 according to tables at (%r11) -+## -+## Requires that %xmm9 = 0x0F0F... as in preheat -+## Output in %xmm0 -+## Clobbers %xmm1, %xmm2 -+## -+.type _vpaes_schedule_transform,%function -+.align 4 -+_vpaes_schedule_transform: -+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 -+ ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 -+ // vmovdqa (%r11), %xmm2 # lo -+ tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 -+ // vmovdqa 16(%r11), %xmm1 # hi -+ tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 -+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 -+ ret -+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform -+ -+## -+## .aes_schedule_mangle -+## -+## Mangle xmm0 from (basis-transformed) standard version -+## to our version. -+## -+## On encrypt, -+## xor with 0x63 -+## multiply by circulant 0,1,1,1 -+## apply shiftrows transform -+## -+## On decrypt, -+## xor with 0x63 -+## multiply by "inverse mixcolumns" circulant E,B,D,9 -+## deskew -+## apply shiftrows transform -+## -+## -+## Writes out to (%rdx), and increments or decrements it -+## Keeps track of round number mod 4 in %r8 -+## Preserves xmm0 -+## Clobbers xmm1-xmm5 -+## -+.type _vpaes_schedule_mangle,%function -+.align 4 -+_vpaes_schedule_mangle: -+ mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later -+ // vmovdqa .Lk_mc_forward(%rip),%xmm5 -+ -+ // encrypting -+ eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 -+ add $out, $out, #16 // add \$16, %rdx -+ tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 -+ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 -+ tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 -+ eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 -+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 -+ eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 -+ -+.Lschedule_mangle_both: -+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 -+ add x8, x8, #64-16 // add \$-16, %r8 -+ and x8, x8, #~(1<<6) // and \$0x30, %r8 -+ st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) -+ ret -+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle -+ -+.globl GFp_vpaes_set_encrypt_key -+.type GFp_vpaes_set_encrypt_key,%function -+.align 4 -+GFp_vpaes_set_encrypt_key: -+ AARCH64_SIGN_LINK_REGISTER -+ stp x29,x30,[sp,#-16]! -+ add x29,sp,#0 -+ stp d8,d9,[sp,#-16]! // ABI spec says so -+ -+ lsr w9, $bits, #5 // shr \$5,%eax -+ add w9, w9, #5 // \$5,%eax -+ str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; -+ -+ mov $dir, #0 // mov \$0,%ecx -+ mov x8, #0x30 // mov \$0x30,%r8d -+ bl _vpaes_schedule_core -+ eor x0, x0, x0 -+ -+ ldp d8,d9,[sp],#16 -+ ldp x29,x30,[sp],#16 -+ AARCH64_VALIDATE_LINK_REGISTER -+ ret -+.size GFp_vpaes_set_encrypt_key,.-GFp_vpaes_set_encrypt_key -+___ -+} -+{ -+my ($inp,$out,$len,$key,$ivec) = map("x$_",(0..4)); -+my ($ctr, $ctr_tmp) = ("w6", "w7"); -+ -+# void GFp_vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, -+# const AES_KEY *key, const uint8_t ivec[16]); -+$code.=<<___; -+.globl GFp_vpaes_ctr32_encrypt_blocks -+.type GFp_vpaes_ctr32_encrypt_blocks,%function -+.align 4 -+GFp_vpaes_ctr32_encrypt_blocks: -+ AARCH64_SIGN_LINK_REGISTER -+ stp x29,x30,[sp,#-16]! -+ add x29,sp,#0 -+ stp d8,d9,[sp,#-16]! // ABI spec says so -+ stp d10,d11,[sp,#-16]! -+ stp d12,d13,[sp,#-16]! -+ stp d14,d15,[sp,#-16]! -+ -+ cbz $len, .Lctr32_done -+ -+ // Note, unlike the other functions, $len here is measured in blocks, -+ // not bytes. -+ mov x17, $len -+ mov x2, $key -+ -+ // Load the IV and counter portion. -+ ldr $ctr, [$ivec, #12] -+ ld1 {v7.16b}, [$ivec] -+ -+ bl _vpaes_encrypt_preheat -+ tst x17, #1 -+ rev $ctr, $ctr // The counter is big-endian. -+ b.eq .Lctr32_prep_loop -+ -+ // Handle one block so the remaining block count is even for -+ // _vpaes_encrypt_2x. -+ ld1 {v6.16b}, [$inp], #16 // Load input ahead of time -+ bl _vpaes_encrypt_core -+ eor v0.16b, v0.16b, v6.16b // XOR input and result -+ st1 {v0.16b}, [$out], #16 -+ subs x17, x17, #1 -+ // Update the counter. -+ add $ctr, $ctr, #1 -+ rev $ctr_tmp, $ctr -+ mov v7.s[3], $ctr_tmp -+ b.ls .Lctr32_done -+ -+.Lctr32_prep_loop: -+ // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x -+ // uses v14 and v15. -+ mov v15.16b, v7.16b -+ mov v14.16b, v7.16b -+ add $ctr, $ctr, #1 -+ rev $ctr_tmp, $ctr -+ mov v15.s[3], $ctr_tmp -+ -+.Lctr32_loop: -+ ld1 {v6.16b,v7.16b}, [$inp], #32 // Load input ahead of time -+ bl _vpaes_encrypt_2x -+ eor v0.16b, v0.16b, v6.16b // XOR input and result -+ eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) -+ st1 {v0.16b,v1.16b}, [$out], #32 -+ subs x17, x17, #2 -+ // Update the counter. -+ add $ctr_tmp, $ctr, #1 -+ add $ctr, $ctr, #2 -+ rev $ctr_tmp, $ctr_tmp -+ mov v14.s[3], $ctr_tmp -+ rev $ctr_tmp, $ctr -+ mov v15.s[3], $ctr_tmp -+ b.hi .Lctr32_loop -+ -+.Lctr32_done: -+ ldp d14,d15,[sp],#16 -+ ldp d12,d13,[sp],#16 -+ ldp d10,d11,[sp],#16 -+ ldp d8,d9,[sp],#16 -+ ldp x29,x30,[sp],#16 -+ AARCH64_VALIDATE_LINK_REGISTER -+ ret -+.size GFp_vpaes_ctr32_encrypt_blocks,.-GFp_vpaes_ctr32_encrypt_blocks -+___ -+} -+ -+print $code; -+ -+close STDOUT or die "error closing STDOUT"; -diff --git a/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl -new file mode 100644 -index 0000000..7e52ad6 ---- /dev/null -+++ b/crypto/fipsmodule/modes/asm/ghash-neon-armv8.pl -@@ -0,0 +1,294 @@ -+#! /usr/bin/env perl -+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. -+# -+# Licensed under the OpenSSL license (the "License"). You may not use -+# this file except in compliance with the License. You can obtain a copy -+# in the file LICENSE in the source distribution or at -+# https://www.openssl.org/source/license.html -+ -+# ==================================================================== -+# Written by Andy Polyakov for the OpenSSL -+# project. The module is, however, dual licensed under OpenSSL and -+# CRYPTOGAMS licenses depending on where you obtain it. For further -+# details see http://www.openssl.org/~appro/cryptogams/. -+# ==================================================================== -+ -+# This file was adapted to AArch64 from the 32-bit version in ghash-armv4.pl. It -+# implements the multiplication algorithm described in: -+# -+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software -+# Polynomial Multiplication on ARM Processors using the NEON Engine. -+# -+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf -+# -+# The main distinction to keep in mind between 32-bit NEON and AArch64 SIMD is -+# AArch64 cannot compute over the upper halves of SIMD registers. In 32-bit -+# NEON, the low and high halves of the 128-bit register q0 are accessible as -+# 64-bit registers d0 and d1, respectively. In AArch64, dN is the lower half of -+# vN. Where the 32-bit version would use the upper half, this file must keep -+# halves in separate registers. -+# -+# The other distinction is in syntax. 32-bit NEON embeds lane information in the -+# instruction name, while AArch64 uses suffixes on the registers. For instance, -+# left-shifting 64-bit lanes of a SIMD register in 32-bit would be written: -+# -+# vshl.i64 q0, q0, #1 -+# -+# in 64-bit, it would be written: -+# -+# shl v0.2d, v0.2d, #1 -+# -+# See Programmer's Guide for ARMv8-A, section 7 for details. -+# http://infocenter.arm.com/help/topic/com.arm.doc.den0024a/DEN0024A_v8_architecture_PG.pdf -+# -+# Finally, note the 8-bit and 64-bit polynomial multipliers in AArch64 differ -+# only by suffix. pmull vR.8h, vA.8b, vB.8b multiplies eight 8-bit polynomials -+# and is always available. pmull vR.1q, vA.1d, vB.1d multiplies a 64-bit -+# polynomial and is conditioned on the PMULL extension. This file emulates the -+# latter with the former. -+ -+use strict; -+ -+my $flavour = shift; -+my $output; -+if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } -+else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } -+ -+if ($flavour && $flavour ne "void") { -+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; -+ my $dir = $1; -+ my $xlate; -+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or -+ ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or -+ die "can't locate arm-xlate.pl"; -+ -+ open OUT,"| \"$^X\" $xlate $flavour $output"; -+ *STDOUT=*OUT; -+} else { -+ open OUT,">$output"; -+ *STDOUT=*OUT; -+} -+ -+my ($Xi, $Htbl, $inp, $len) = map("x$_", (0..3)); # argument block -+my ($Xl, $Xm, $Xh, $INlo, $INhi) = map("v$_", (0..4)); -+my ($Hlo, $Hhi, $Hhl) = map("v$_", (5..7)); -+# d8-d15 are callee-saved, so avoid v8-v15. AArch64 SIMD has plenty of registers -+# to spare. -+my ($t0, $t1, $t2, $t3) = map("v$_", (16..19)); -+my ($t0l_t1l, $t0h_t1h, $t2l_t3l, $t2h_t3h) = map("v$_", (20..23)); -+my ($k48_k32, $k16_k0) = map("v$_", (24..25)); -+ -+my $code = ""; -+ -+# clmul64x64 emits code which emulates pmull $r.1q, $a.1d, $b.1d. $r, $a, and $b -+# must be distinct from $t* and $k*. $t* are clobbered by the emitted code. -+sub clmul64x64 { -+my ($r, $a, $b) = @_; -+$code .= <<___; -+ ext $t0.8b, $a.8b, $a.8b, #1 // A1 -+ pmull $t0.8h, $t0.8b, $b.8b // F = A1*B -+ ext $r.8b, $b.8b, $b.8b, #1 // B1 -+ pmull $r.8h, $a.8b, $r.8b // E = A*B1 -+ ext $t1.8b, $a.8b, $a.8b, #2 // A2 -+ pmull $t1.8h, $t1.8b, $b.8b // H = A2*B -+ ext $t3.8b, $b.8b, $b.8b, #2 // B2 -+ pmull $t3.8h, $a.8b, $t3.8b // G = A*B2 -+ ext $t2.8b, $a.8b, $a.8b, #3 // A3 -+ eor $t0.16b, $t0.16b, $r.16b // L = E + F -+ pmull $t2.8h, $t2.8b, $b.8b // J = A3*B -+ ext $r.8b, $b.8b, $b.8b, #3 // B3 -+ eor $t1.16b, $t1.16b, $t3.16b // M = G + H -+ pmull $r.8h, $a.8b, $r.8b // I = A*B3 -+ -+ // Here we diverge from the 32-bit version. It computes the following -+ // (instructions reordered for clarity): -+ // -+ // veor \$t0#lo, \$t0#lo, \$t0#hi @ t0 = P0 + P1 (L) -+ // vand \$t0#hi, \$t0#hi, \$k48 -+ // veor \$t0#lo, \$t0#lo, \$t0#hi -+ // -+ // veor \$t1#lo, \$t1#lo, \$t1#hi @ t1 = P2 + P3 (M) -+ // vand \$t1#hi, \$t1#hi, \$k32 -+ // veor \$t1#lo, \$t1#lo, \$t1#hi -+ // -+ // veor \$t2#lo, \$t2#lo, \$t2#hi @ t2 = P4 + P5 (N) -+ // vand \$t2#hi, \$t2#hi, \$k16 -+ // veor \$t2#lo, \$t2#lo, \$t2#hi -+ // -+ // veor \$t3#lo, \$t3#lo, \$t3#hi @ t3 = P6 + P7 (K) -+ // vmov.i64 \$t3#hi, #0 -+ // -+ // \$kN is a mask with the bottom N bits set. AArch64 cannot compute on -+ // upper halves of SIMD registers, so we must split each half into -+ // separate registers. To compensate, we pair computations up and -+ // parallelize. -+ -+ ext $t3.8b, $b.8b, $b.8b, #4 // B4 -+ eor $t2.16b, $t2.16b, $r.16b // N = I + J -+ pmull $t3.8h, $a.8b, $t3.8b // K = A*B4 -+ -+ // This can probably be scheduled more efficiently. For now, we just -+ // pair up independent instructions. -+ zip1 $t0l_t1l.2d, $t0.2d, $t1.2d -+ zip1 $t2l_t3l.2d, $t2.2d, $t3.2d -+ zip2 $t0h_t1h.2d, $t0.2d, $t1.2d -+ zip2 $t2h_t3h.2d, $t2.2d, $t3.2d -+ eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b -+ eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b -+ and $t0h_t1h.16b, $t0h_t1h.16b, $k48_k32.16b -+ and $t2h_t3h.16b, $t2h_t3h.16b, $k16_k0.16b -+ eor $t0l_t1l.16b, $t0l_t1l.16b, $t0h_t1h.16b -+ eor $t2l_t3l.16b, $t2l_t3l.16b, $t2h_t3h.16b -+ zip1 $t0.2d, $t0l_t1l.2d, $t0h_t1h.2d -+ zip1 $t2.2d, $t2l_t3l.2d, $t2h_t3h.2d -+ zip2 $t1.2d, $t0l_t1l.2d, $t0h_t1h.2d -+ zip2 $t3.2d, $t2l_t3l.2d, $t2h_t3h.2d -+ -+ ext $t0.16b, $t0.16b, $t0.16b, #15 // t0 = t0 << 8 -+ ext $t1.16b, $t1.16b, $t1.16b, #14 // t1 = t1 << 16 -+ pmull $r.8h, $a.8b, $b.8b // D = A*B -+ ext $t3.16b, $t3.16b, $t3.16b, #12 // t3 = t3 << 32 -+ ext $t2.16b, $t2.16b, $t2.16b, #13 // t2 = t2 << 24 -+ eor $t0.16b, $t0.16b, $t1.16b -+ eor $t2.16b, $t2.16b, $t3.16b -+ eor $r.16b, $r.16b, $t0.16b -+ eor $r.16b, $r.16b, $t2.16b -+___ -+} -+ -+$code .= <<___; -+#include -+ -+.text -+ -+.global GFp_gcm_init_neon -+.type GFp_gcm_init_neon,%function -+.align 4 -+GFp_gcm_init_neon: -+ AARCH64_VALID_CALL_TARGET -+ // This function is adapted from gcm_init_v8. xC2 is t3. -+ ld1 {$t1.2d}, [x1] // load H -+ movi $t3.16b, #0xe1 -+ shl $t3.2d, $t3.2d, #57 // 0xc2.0 -+ ext $INlo.16b, $t1.16b, $t1.16b, #8 -+ ushr $t2.2d, $t3.2d, #63 -+ dup $t1.4s, $t1.s[1] -+ ext $t0.16b, $t2.16b, $t3.16b, #8 // t0=0xc2....01 -+ ushr $t2.2d, $INlo.2d, #63 -+ sshr $t1.4s, $t1.4s, #31 // broadcast carry bit -+ and $t2.16b, $t2.16b, $t0.16b -+ shl $INlo.2d, $INlo.2d, #1 -+ ext $t2.16b, $t2.16b, $t2.16b, #8 -+ and $t0.16b, $t0.16b, $t1.16b -+ orr $INlo.16b, $INlo.16b, $t2.16b // H<<<=1 -+ eor $Hlo.16b, $INlo.16b, $t0.16b // twisted H -+ st1 {$Hlo.2d}, [x0] // store Htable[0] -+ ret -+.size GFp_gcm_init_neon,.-GFp_gcm_init_neon -+ -+.global GFp_gcm_gmult_neon -+.type GFp_gcm_gmult_neon,%function -+.align 4 -+GFp_gcm_gmult_neon: -+ AARCH64_VALID_CALL_TARGET -+ ld1 {$INlo.16b}, [$Xi] // load Xi -+ ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H -+ ld1 {$Hhi.1d}, [$Htbl] -+ adrp x9, :pg_hi21:.Lmasks // load constants -+ add x9, x9, :lo12:.Lmasks -+ ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] -+ rev64 $INlo.16b, $INlo.16b // byteswap Xi -+ ext $INlo.16b, $INlo.16b, $INlo.16b, #8 -+ eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing -+ -+ mov $len, #16 -+ b .Lgmult_neon -+.size GFp_gcm_gmult_neon,.-GFp_gcm_gmult_neon -+ -+.global GFp_gcm_ghash_neon -+.type GFp_gcm_ghash_neon,%function -+.align 4 -+GFp_gcm_ghash_neon: -+ AARCH64_VALID_CALL_TARGET -+ ld1 {$Xl.16b}, [$Xi] // load Xi -+ ld1 {$Hlo.1d}, [$Htbl], #8 // load twisted H -+ ld1 {$Hhi.1d}, [$Htbl] -+ adrp x9, :pg_hi21:.Lmasks // load constants -+ add x9, x9, :lo12:.Lmasks -+ ld1 {$k48_k32.2d, $k16_k0.2d}, [x9] -+ rev64 $Xl.16b, $Xl.16b // byteswap Xi -+ ext $Xl.16b, $Xl.16b, $Xl.16b, #8 -+ eor $Hhl.8b, $Hlo.8b, $Hhi.8b // Karatsuba pre-processing -+ -+.Loop_neon: -+ ld1 {$INlo.16b}, [$inp], #16 // load inp -+ rev64 $INlo.16b, $INlo.16b // byteswap inp -+ ext $INlo.16b, $INlo.16b, $INlo.16b, #8 -+ eor $INlo.16b, $INlo.16b, $Xl.16b // inp ^= Xi -+ -+.Lgmult_neon: -+ // Split the input into $INlo and $INhi. (The upper halves are unused, -+ // so it is okay to leave them alone.) -+ ins $INhi.d[0], $INlo.d[1] -+___ -+&clmul64x64 ($Xl, $Hlo, $INlo); # H.lo·Xi.lo -+$code .= <<___; -+ eor $INlo.8b, $INlo.8b, $INhi.8b // Karatsuba pre-processing -+___ -+&clmul64x64 ($Xm, $Hhl, $INlo); # (H.lo+H.hi)·(Xi.lo+Xi.hi) -+&clmul64x64 ($Xh, $Hhi, $INhi); # H.hi·Xi.hi -+$code .= <<___; -+ ext $t0.16b, $Xl.16b, $Xh.16b, #8 -+ eor $Xm.16b, $Xm.16b, $Xl.16b // Karatsuba post-processing -+ eor $Xm.16b, $Xm.16b, $Xh.16b -+ eor $Xm.16b, $Xm.16b, $t0.16b // Xm overlaps Xh.lo and Xl.hi -+ ins $Xl.d[1], $Xm.d[0] // Xh|Xl - 256-bit result -+ // This is a no-op due to the ins instruction below. -+ // ins $Xh.d[0], $Xm.d[1] -+ -+ // equivalent of reduction_avx from ghash-x86_64.pl -+ shl $t1.2d, $Xl.2d, #57 // 1st phase -+ shl $t2.2d, $Xl.2d, #62 -+ eor $t2.16b, $t2.16b, $t1.16b // -+ shl $t1.2d, $Xl.2d, #63 -+ eor $t2.16b, $t2.16b, $t1.16b // -+ // Note Xm contains {Xl.d[1], Xh.d[0]}. -+ eor $t2.16b, $t2.16b, $Xm.16b -+ ins $Xl.d[1], $t2.d[0] // Xl.d[1] ^= t2.d[0] -+ ins $Xh.d[0], $t2.d[1] // Xh.d[0] ^= t2.d[1] -+ -+ ushr $t2.2d, $Xl.2d, #1 // 2nd phase -+ eor $Xh.16b, $Xh.16b,$Xl.16b -+ eor $Xl.16b, $Xl.16b,$t2.16b // -+ ushr $t2.2d, $t2.2d, #6 -+ ushr $Xl.2d, $Xl.2d, #1 // -+ eor $Xl.16b, $Xl.16b, $Xh.16b // -+ eor $Xl.16b, $Xl.16b, $t2.16b // -+ -+ subs $len, $len, #16 -+ bne .Loop_neon -+ -+ rev64 $Xl.16b, $Xl.16b // byteswap Xi and write -+ ext $Xl.16b, $Xl.16b, $Xl.16b, #8 -+ st1 {$Xl.16b}, [$Xi] -+ -+ ret -+.size GFp_gcm_ghash_neon,.-GFp_gcm_ghash_neon -+ -+.section .rodata -+.align 4 -+.Lmasks: -+.quad 0x0000ffffffffffff // k48 -+.quad 0x00000000ffffffff // k32 -+.quad 0x000000000000ffff // k16 -+.quad 0x0000000000000000 // k0 -+.asciz "GHASH for ARMv8, derived from ARMv4 version by " -+.align 2 -+___ -+ -+foreach (split("\n",$code)) { -+ s/\`([^\`]*)\`/eval $1/geo; -+ -+ print $_,"\n"; -+} -+close STDOUT or die "error closing STDOUT"; # enforce flush --- -Efraim Flashner רנשלפ םירפא -GPG key = A28B F40C 3E55 1372 662D 14F7 41AA E7DC CA3D 8351 -Confidentiality cannot be guaranteed on emails sent or received unencrypted - diff --git a/gnu/packages/patches/rust-ring-0.16-test-files.patch b/gnu/packages/patches/rust-ring-0.16-test-files.patch deleted file mode 100644 index dbe5c0f4ee..0000000000 --- a/gnu/packages/patches/rust-ring-0.16-test-files.patch +++ /dev/null @@ -1,54 +0,0 @@ -This file exists in the upstream repository at the commit which -corresponds to the ring-0.16.20 release, but was excluded from the -release tarball. - ---- - tests/ed25519_verify_tests.txt | 34 ++++++++++++++++++++++++++++++++++ - 1 file changed, 34 insertions(+) - create mode 100644 tests/ed25519_verify_tests.txt - -diff --git a/tests/ed25519_verify_tests.txt b/tests/ed25519_verify_tests.txt -new file mode 100644 -index 0000000..74c94b3 ---- /dev/null -+++ b/tests/ed25519_verify_tests.txt -@@ -0,0 +1,34 @@ -+# BoringSSL TEST(Ed25519Test Malleability) -+ -+# Control; S is in range. -+MESSAGE = 54657374 -+SIG = 7c38e026f29e14aabd059a0f2db8b0cd783040609a8be684db12f82a27774ab07a9155711ecfaf7f99f277bad0c6ae7e39d4eef676573336a5c51eb6f946b30d -+PUB = 7d4d0e7f6153a69b6242b522abbee685fda4420f8834b108c3bdae369ef549fa -+Result = P -+ -+# Same as above, but with the order L added to S so it is out of range. -+# BoringSSL commit 472ba2c2dd52d06a657a63b7fbf02732a6649d21 -+MESSAGE = 54657374 -+SIG = 7c38e026f29e14aabd059a0f2db8b0cd783040609a8be684db12f82a27774ab067654bce3832c2d76f8f6f5dafc08d9339d4eef676573336a5c51eb6f946b31d -+PUB = 7d4d0e7f6153a69b6242b522abbee685fda4420f8834b108c3bdae369ef549fa -+Result = F -+ -+ -+# BoringSSL commit 3094902fcdc2db2cc832fa854b9a6a8be383926c -+MESSAGE = 124e583f8b8eca58bb29c271b41d36986bbc45541f8e51f9cb0133eca447601e -+SIG = dac119d6ca87fc59ae611c157048f4d4fc932a149dbe20ec6effd1436abf83ea05c7df0fef06147241259113909bc71bd3c53ba4464ffcad3c0968f2ffffff0f -+PUB = 100fdf47fb94f1536a4f7c3fda27383fa03375a8f527c537e6f1703c47f94f86 -+Result = P -+ -+# Control. Same key as above; same message and signature as below, except S is in range. -+PUB = 100fdf47fb94f1536a4f7c3fda27383fa03375a8f527c537e6f1703c47f94f86 -+MESSAGE = 6a0bc2b0057cedfc0fa2e3f7f7d39279b30f454a69dfd1117c758d86b19d85e0 -+SIG = 0971f86d2c9c78582524a103cb9cf949522ae528f8054dc20107d999be673ff4f58ac9d20ec563133cabc6230b1db8625f8446639ede46ad4df4053000000000 -+Result = P -+ -+# Same key as above, but S is out of range. -+# BoringSSL commit 472ba2c2dd52d06a657a63b7fbf02732a6649d21 -+PUB = 100fdf47fb94f1536a4f7c3fda27383fa03375a8f527c537e6f1703c47f94f86 -+MESSAGE = 6a0bc2b0057cedfc0fa2e3f7f7d39279b30f454a69dfd1117c758d86b19d85e0 -+SIG = 0971f86d2c9c78582524a103cb9cf949522ae528f8054dc20107d999be673ff4e25ebf2f2928766b1248bec6e91697775f8446639ede46ad4df4053000000010 -+Result = F --- -Efraim Flashner רנשלפ םירפא -GPG key = A28B F40C 3E55 1372 662D 14F7 41AA E7DC CA3D 8351 -Confidentiality cannot be guaranteed on emails sent or received unencrypted - diff --git a/gnu/packages/rust-apps.scm b/gnu/packages/rust-apps.scm index 77e4f24f50..0675fee658 100644 --- a/gnu/packages/rust-apps.scm +++ b/gnu/packages/rust-apps.scm @@ -114,8 +114,6 @@ ("rust-tokio" ,rust-tokio-1) ("rust-tokio-rustls" ,rust-tokio-rustls-0.22) ("rust-url" ,rust-url-2)))) - (native-inputs - (list perl)) (home-page "https://github.com/mbrubeck/agate") (synopsis "Very simple server for the Gemini hypertext protocol") (description @@ -192,7 +190,7 @@ low-end hardware and serving many concurrent requests.") (("rust-serde-bytes" ,rust-serde-bytes-0.11) ("rust-serde-derive" ,rust-serde-derive-1)))) (native-inputs - (list perl pkg-config)) + (list pkg-config)) (inputs (list at-spi2-core gtk @@ -1193,8 +1191,7 @@ on the terminal in a visually appealing way.") (list python-tomli)) (inputs (list bzip2)) (native-inputs - (list perl - python-wheel + (list python-wheel python-wrapper python-setuptools-rust)) (home-page "https://github.com/pyo3/maturin") @@ -1504,7 +1501,7 @@ browsers.") (substitute* "Cargo.toml" ((".*\"vendored-libgit2\".*") ""))))))) (native-inputs - (list perl pkg-config)) + (list pkg-config)) (inputs (list libgit2-1.4 libssh2 @@ -1695,8 +1692,6 @@ rebase.") (("rust-boxxy" ,rust-boxxy-0.12)))) (inputs (list libpcap libseccomp)) - (native-inputs - (list perl)) (home-page "https://github.com/kpcyrd/sniffglue") (synopsis "Secure multithreaded packet sniffer") (description @@ -2366,7 +2361,7 @@ consecutive lines and since program start.") ;"dbus_mpris" ; Conflicts with rust-chrono-0.4 version. "pulseaudio_backend" "rodio_backend"))) - (native-inputs (list perl pkg-config)) + (native-inputs (list pkg-config)) (inputs (list alsa-lib dbus pulseaudio)) (home-page "https://github.com/Spotifyd/spotifyd") (synopsis "Spotify streaming daemon with Spotify Connect support") @@ -2543,7 +2538,7 @@ daemon which executes them.") ("rust-predicates" ,rust-predicates-2) ("rust-tempfile" ,rust-tempfile-3)))) (native-inputs - (list perl pkg-config)) + (list pkg-config)) (inputs (list openssl)) (home-page "https://github.com/dbrgn/tealdeer/") -- cgit 1.4.1