Synthesizing optimal 8051 code

Tags:

While working on an application targeting Nordic nRF24LE1, a wireless SoC with a fairly slow 8051 core, I was wondering if I can have fast, or at least not unusably slow, cryptography. Most cryptographic algorithms involve wide rotates, and the 8051 only has instructions for rotating a 8-bit accumulator by one bit at a time. In this note I explore deriving optimal code for rotating values in registers (that may be bigger than 8 bits) by multiple bits.

Introduction

My chosen approach (thanks to John Regehr for the suggestion) is to implement an interpreter for an abstract 8051 assembly representation in Racket and then use Rosette to translate assertions about the results of interpreting an arbitrary piece of code into a query for an SMT solver.

Rosette greatly simplifies this task because it lets me avoid learning anything about SMT solvers, and only requires me to understand the constraints of its symbolic execution approach. (Only a small subset of Racket is safe to use in Rosette, and functions outside of that subset are hard to use correctly without an in-depth understaning of how Rosette works.)

Code generator

The following code generates all possible optimal (more on that below) 8-bit and 16-bit rotates. It uses a rather hacky and complicated scheme where it runs several solvers in parallel, one per CPU, each aiming for a particular fixed number of instructions, and then picks the smallest result as the solvers finish. This is because at the time of writing it, I did not understand that Rosette allows optimizing exists-forall problems. (It is quite easy to do so, as I describe in a later note.)

However, that turned out to be a blessing in disguise; when writing this note, I rewrote the query as an optimization problem for the solver, and it doesn’t seem like that would work for this use case. First, of the solvers that can be used by Rosette, only Z3 supports quantified formulas, whereas Boolector had the best performance with the simpler queries. Second, even for very small programs (such as 8-bit rotates, which all fit in 4 instructions, and even restricting the usable registers to 2 out of 8), the memory footprint of Z3 grows extremely quickly, and I always ran out of memory before getting a solution.

By “optimal” here I mean “optimal within the limited model being used”, of course. The model I’m using specifically omits any memory access (preventing the use of the XCHD instruction among other things), and in general has a very limited number of instructions to make solver runtime manageable. It is possible (but unlikely) that some of the instructions missing in the model but present in every 8051 CPU provide a faster way to do rotates. It is possible (and fairly likely) that your specific flavor of 8051 CPU provides a faster way to do rotates that involves memory-mapped I/O; indeed, nRF24LE1 does, but I was interested in more portable code.

synth51.rkt (download)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#lang rosette/safe

(require (only-in racket hash in-range for for/list with-handlers flush-output
                  thread thread-wait break-thread exn:break?
                  make-semaphore semaphore-wait semaphore-post call-with-semaphore/enable-break
                  processor-count))
(require rosette/solver/smt/z3
         rosette/solver/smt/boolector
         rosette/solver/smt/yices)
;(current-solver (z3 #:logic 'QF_BV #:options (hash
;   ':parallel.enable 'true
;   ':parallel.threads.max 4)))
;(current-solver (yices #:logic 'QF_BV))
(current-solver (boolector #:logic 'QF_BV))

(require rosette/lib/angelic
         rosette/lib/match)
(current-bitwidth 5)

; bit operations
(define (rotate-right s i x)
  (cond
    [(= i 0) x]
    [else (concat (extract (- i 1) 0 x) (extract (- s 1) i x))]))
(define (rotate-left s i x)
  (cond
    [(= i 0) x]
    [else (concat (extract (- s i 1) 0 x) (extract (- s 1) (- s i) x))]))

(define (replace-bit s i x y)
  (define m (bvshl (bv 1 s) (integer->bitvector i s)))
  (cond
    [(bveq y (bv 0 1)) (bvand x (bvnot m))]
    [(bveq y (bv 1 1)) (bvor x m)]
    [else (assert #f)]))

; CPU state
(struct state (A C Rn) #:mutable #:transparent)

(define (state-Rn-ref S n)
  (vector-ref (state-Rn S) n))
(define (state-Rn-set! S n v)
  (vector-set! (state-Rn S) n v))
(define (state-R0 S) (state-Rn-ref S 0))
(define (state-R1 S) (state-Rn-ref S 1))
(define (state-R2 S) (state-Rn-ref S 2))
(define (state-R3 S) (state-Rn-ref S 3))
(define (state-R4 S) (state-Rn-ref S 4))
(define (state-R5 S) (state-Rn-ref S 5))
(define (state-R6 S) (state-Rn-ref S 6))
(define (state-R7 S) (state-Rn-ref S 7))

(define-symbolic A R0 R1 R2 R3 R4 R5 R6 R7 (bitvector 8))
(define-symbolic C (bitvector 1))
(define (make-state)
  (state A C (vector R0 R1 R2 R3 R4 R5 R6 R7)))

; instructions
(struct MOV-A-Rn (n) #:transparent)
(struct MOV-Rn-A (n) #:transparent)
(struct ANL-A-Rn (n) #:transparent)
(struct ORL-A-Rn (n) #:transparent)
(struct XRL-A-Rn (n) #:transparent)
(struct XCH-A-Rn (n) #:transparent)
(struct MOV-A-i (i) #:transparent)
(struct ANL-A-i (i) #:transparent)
(struct ORL-A-i (i) #:transparent)
(struct SWAP-A () #:transparent)
(struct CLR-C () #:transparent)
(struct MOV-C-An (n) #:transparent)
(struct MOV-An-C (n) #:transparent)
(struct RLC-A () #:transparent)
(struct RRC-A () #:transparent)
(struct RL-A () #:transparent)
(struct RR-A () #:transparent)

(define (print-insn insn)
  (match insn
    [(MOV-A-Rn n) (printf "MOV A, R~s~n" n)]
    [(MOV-Rn-A n) (printf "MOV R~s, A~n" n)]
    [(ANL-A-Rn n) (printf "ANL A, R~s~n" n)]
    [(ORL-A-Rn n) (printf "ORL A, R~s~n" n)]
    [(XRL-A-Rn n) (printf "XRL A, R~s~n" n)]
    [(XCH-A-Rn n) (printf "XCH A, R~s~n" n)]
    [(MOV-A-i i)  (printf "MOV A, #0x~x~n" (bitvector->natural i))]
    [(ANL-A-i i)  (printf "ANL A, #0x~x~n" (bitvector->natural i))]
    [(ORL-A-i i)  (printf "ORL A, #0x~x~n" (bitvector->natural i))]
    [(SWAP-A)     (printf "SWAP A~n")]
    [(CLR-C)      (printf "CLR C~n")]
    [(MOV-C-An n) (printf "MOV C, ACC.~s~n" n)]
    [(MOV-An-C n) (printf "MOV ACC.~s, C~n" n)]
    [(RLC-A)      (printf "RLC A~n")]
    [(RRC-A)      (printf "RRC A~n")]
    [(RL-A)       (printf "RL A~n")]
    [(RR-A)       (printf "RR A~n")]))

; sketches
(define (??insn)
  (define n (choose* 0 1)); 2 3 4 5 6 7))
  (define-symbolic* i (bitvector 8))
  ;(define i (choose* (bv #xf0 8) (bv #x0f 8)))
  (choose* (MOV-A-Rn n)
           (MOV-Rn-A n)
           (ANL-A-Rn n)
           (ORL-A-Rn n)
           (XRL-A-Rn n)
           (XCH-A-Rn n)
           (MOV-A-i i)
           (ANL-A-i i)
           (ORL-A-i i)
           (SWAP-A)
           (CLR-C)
           (MOV-C-An n)
           (MOV-An-C n)
           (RLC-A)
           (RRC-A)
           (RL-A)
           (RR-A)))

(define (??prog fuel)
  (if (= fuel 0) null
      (cons (??insn) (??prog (- fuel 1)))))

; symbolic interpreter
(define (run-insn S insn)
  (match insn
    [(MOV-A-Rn n)
     (set-state-A! S (state-Rn-ref S n))]
    [(MOV-Rn-A n)
     (state-Rn-set! S n (state-A S))]
    [(ANL-A-Rn n)
     (set-state-A! S (bvand (state-A S) (state-Rn-ref S n)))]
    [(ORL-A-Rn n)
     (set-state-A! S (bvor  (state-A S) (state-Rn-ref S n)))]
    [(XRL-A-Rn n)
     (set-state-A! S (bvxor (state-A S) (state-Rn-ref S n)))]
    [(XCH-A-Rn n)
     (let ([A (state-A S)] [Rn (state-Rn-ref S n)])
       (set-state-A! S Rn) (state-Rn-set! S n A))]
    [(MOV-A-i i)
     (set-state-A! S i)]
    [(ANL-A-i i)
     (set-state-A! S (bvand (state-A S) i))]
    [(ORL-A-i i)
     (set-state-A! S (bvor  (state-A S) i))]
    [(SWAP-A)
     (let ([A (state-A S)])
       (set-state-A! S (concat (extract 3 0 A) (extract 7 4 A))))]
    [(CLR-C)
     (set-state-C! S (bv 0 1))]
    [(MOV-C-An n)
     (set-state-C! S (extract n n (state-A S)))]
    [(MOV-An-C n)
     (set-state-A! S (replace-bit 8 n (state-A S) (state-C S)))]
    [(RLC-A)
     (let ([A (state-A S)] [C (state-C S)])
       (set-state-A! S (concat (extract 6 0 A) C))
       (set-state-C! S (extract 7 7 A)))]
    [(RRC-A)
     (let ([A (state-A S)] [C (state-C S)])
       (set-state-A! S (concat C (extract 7 1 A)))
       (set-state-C! S (extract 0 0 A)))]
    [(RL-A)
     (let ([A (state-A S)])
       (set-state-A! S (concat (extract 6 0 A) (extract 7 7 A))))]
    [(RR-A)
     (let ([A (state-A S)])
       (set-state-A! S (concat (extract 0 0 A) (extract 7 1 A))))]
    ))

; program verifier
(define (verify-prog prog asserts)
  (define S  (make-state))
  (define S* (make-state))
  (define solution
    (verify
     #:guarantee
     (begin
       (for-each (curry run-insn S*) prog)
       (asserts S S*))))
  (if (unsat? solution) #t
      (begin
        (displayln (evaluate S  solution))
        (displayln (evaluate S* solution))
        #f)))

; program synthesizer
(define (synthesize-prog sketch asserts)
  (define S  (make-state))
  (define S* (make-state))
  (define solution
    (synthesize
     #:forall S
     #:guarantee
     (begin
       (for-each (lambda (insn) (run-insn S* insn)) sketch)
       (asserts S S*))))
  (if (unsat? solution) #f
      (evaluate sketch solution)))

(define (optimize-prog max-fuel sketch-gen asserts)
  (define (worker fuel)
    (define prog (synthesize-prog (sketch-gen fuel) asserts))
    (if prog
        (begin
          (eprintf "sat! ~s~n" fuel)
          (for-each print-insn prog))
        (begin
          (eprintf "unsat! ~s~n" fuel)
          (if (>= fuel max-fuel) #f
              (worker (+ fuel 1))))))
  (worker 0))

(define (optimize-prog/parallel max-fuel sketch-gen asserts)
  (define solved (box #f))
  (define solved-fuel (box 1000))
  (define threads (box '()))
  (define report-sema (make-semaphore 1))
  (define (worker fuel)
    (cond
      [(or (not (unbox solved)) (< fuel (unbox solved-fuel)))
       (define prog (synthesize-prog (sketch-gen fuel) asserts))
       (call-with-semaphore/enable-break report-sema
        (lambda ()
          (if prog
              (begin
                (eprintf "sat! ~s~n" fuel)
                (for-each (lambda (thd-fuel)
                            (if (> (cdr thd-fuel) fuel)
                                (break-thread (car thd-fuel))
                                (void))) (unbox threads))
                (if (or (not (unbox solved)) (< fuel (unbox solved-fuel)))
                    (begin
                      (set-box! solved-fuel fuel)
                      (set-box! solved prog))
                    (void)))
              (eprintf "unsat! ~s~n" fuel))))]))
  (define core-sema (make-semaphore (processor-count)))
  (for ([fuel (in-range (add1 max-fuel))])
    (semaphore-wait core-sema)
    (define thd
      (thread (lambda ()
                (with-handlers ([exn:break? (lambda (x) (void))])
                  (worker fuel))
                (semaphore-post core-sema))))
    (set-box! threads (cons (cons thd fuel) (unbox threads))))
  (for-each (lambda (thd-fuel) (thread-wait (car thd-fuel))) (unbox threads))
  (if (not (unbox solved)) (void)
      (begin
        (for-each print-insn (unbox solved))
        (flush-output))))

(define (assert-preserve S S* . regs)
  (define (assert-preserve-reg n)
    (assert (bveq (state-Rn-ref S n) (state-Rn-ref S* n))))
  (for-each assert-preserve-reg regs))

; examples
(define (optimize-8b-rotate-right n)
  (optimize-prog/parallel
   4
   (lambda (fuel) (??prog fuel))
   (lambda (S S*)
     (assert (bveq (rotate-right 8 n (state-R0 S)) (state-R0 S*)))
     (assert-preserve S S* 1 2 3 4 5 6 7))))
(for ([n (in-range 8)])
 (time
  (printf "; rotate right R0 by ~a~n" n) (flush-output)
  (optimize-8b-rotate-right n)
  (printf "; ")))

(define (optimize-16b-rotate-right n)
  (optimize-prog/parallel
   20
   (lambda (fuel)
    (if (= fuel 0) '()
      (append
       ; help the synthesizer out a bit
       (list (MOV-A-Rn (choose* 0 1)))
       (??prog fuel)
       (list (MOV-Rn-A (choose* 0 1))))))
   (lambda (S S*)
     (define R10  (concat (state-R1 S ) (state-R0 S )))
     (define R10* (concat (state-R1 S*) (state-R0 S*)))
     (assert (bveq (rotate-right 16 n R10) R10*))
     (assert-preserve S S* 2 3 4 5 6 7))))
(for ([n (in-range 16)])
 (time
  (printf "; rotate right R1:R0 by ~a~n" n) (flush-output)
  (optimize-16b-rotate-right n)
  (printf "; ")))

Results

Generating the optimal 8-bit and 16-bit rotates took about half a day on a modern laptop (Dell XPS13 9360, using all cores and with mitigations disabled). Because of that I have not attempted generating wider ones so far.

8-bit rotates

The following code lists all optimal 8-bit rotates, by 0 to 7 bits.

rot8.asm (download)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
; rotate right R0 by 0
; rotate right R0 by 1
MOV A, R0
RR A
MOV R0, A
; rotate right R0 by 2
MOV A, R0
RR A
RR A
MOV R0, A
; rotate right R0 by 3
XCH A, R0
RL A
SWAP A
MOV R0, A
; rotate right R0 by 4
MOV A, R0
SWAP A
MOV R0, A
; rotate right R0 by 5
MOV A, R0
SWAP A
RR A
XCH A, R0
; rotate right R0 by 6
MOV A, R0
RL A
RL A
MOV R0, A
; rotate right R0 by 7
MOV A, R0
RL A
MOV R0, A

16-bit rotates

The following code lists all optimal 16-bit rotates, by 0 to 15 bits. I find the approach the solver used for the rotate by 10 nothing short of brilliant, and the approach it took for rotate by 3/5/11/13 pretty neat as well.

rot16.asm (download)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
; rotate right R1:R0 by 0
; rotate right R1:R0 by 1
MOV A, R1
MOV C, ACC.0
XCH A, R0
RRC A
XCH A, R0
RRC A
MOV R1, A
; rotate right R1:R0 by 2
MOV A, R1
MOV C, ACC.0
XCH A, R0
RRC A
XCH A, R0
RRC A
MOV C, ACC.0
XCH A, R0
RRC A
XCH A, R0
RRC A
MOV R1, A
; rotate right R1:R0 by 3
MOV A, R0
XRL A, R1
MOV R1, A
ANL A, #0xf8
XRL A, R0
SWAP A
RL A
XCH A, R1
SWAP A
RL A
XRL A, R1
MOV R0, A
; rotate right R1:R0 by 4
MOV A, R0
XRL A, R1
ANL A, #0xf0
XCH A, R1
XRL A, R1
SWAP A
XCH A, R0
XRL A, R1
SWAP A
MOV R1, A
; rotate right R1:R0 by 5
MOV A, R1
XRL A, R0
ANL A, #0x1f
XCH A, R0
XRL A, R0
RR A
SWAP A
XCH A, R0
XRL A, R1
SWAP A
RR A
MOV R1, A
; rotate right R1:R0 by 6
MOV A, R1
RLC A
XCH A, R0
RLC A
XCH A, R1
RLC A
MOV C, ACC.7
XCH A, R1
RLC A
XCH A, R1
RLC A
MOV R0, A
; rotate right R1:R0 by 7
MOV A, R0
MOV C, ACC.7
MOV A, R1
RLC A
XCH A, R0
RLC A
MOV R1, A
; rotate right R1:R0 by 8
MOV A, R1
XCH A, R0
MOV R1, A
; rotate right R1:R0 by 9
MOV A, R1
RRC A
MOV A, R0
RRC A
XCH A, R1
RRC A
MOV R0, A
; rotate right R1:R0 by 10
MOV A, R1
XRL A, R0
ANL A, #0x3
XRL A, R1
RR A
RR A
XCH A, R0
XRL A, R1
RR A
RR A
XRL A, R0
MOV R1, A
; rotate right R1:R0 by 11
MOV A, R1
XRL A, R0
MOV R1, A
ANL A, #0x7
XRL A, R0
RL A
SWAP A
XCH A, R1
RL A
SWAP A
XRL A, R1
MOV R0, A
; rotate right R1:R0 by 12
MOV A, R0
XRL A, R1
ANL A, #0xf0
XCH A, R1
XRL A, R1
SWAP A
XCH A, R1
XRL A, R0
SWAP A
MOV R0, A
; rotate right R1:R0 by 13
MOV A, R1
XRL A, R0
ANL A, #0xe0
XCH A, R0
XRL A, R0
SWAP A
RR A
XCH A, R0
XRL A, R1
RR A
SWAP A
MOV R1, A
; rotate right R1:R0 by 14
MOV A, R1
MOV C, ACC.7
XCH A, R0
RLC A
XCH A, R0
RLC A
MOV C, ACC.7
XCH A, R0
RLC A
XCH A, R0
RLC A
MOV R1, A
; rotate right R1:R0 by 15
MOV A, R1
MOV C, ACC.7
XCH A, R0
RLC A
XCH A, R0
RLC A
MOV R1, A

Want to discuss this note? Drop me a letter.