On this page:
13.1 Array data
13.2 Meaning of Hoax programs, implicitly
13.3 Representing Hoax values
13.4 Representing and operating on vectors
13.5 Representing and operating on strings
13.6 A Compiler for Hoax
13.7 A Run-Time for Hoax
8.14

13 Hoax: vectors and strings🔗

image Source code.

Stupidity, outrage, vanity, cruelty, iniquity, bad faith, falsehood - we fail to see the whole array when it is facing in the same direction as we.

    13.1 Array data

    13.2 Meaning of Hoax programs, implicitly

    13.3 Representing Hoax values

    13.4 Representing and operating on vectors

    13.5 Representing and operating on strings

    13.6 A Compiler for Hoax

    13.7 A Run-Time for Hoax

13.1 Array data🔗

In the Hoax language, we will add two array data types: vectors and strings.

Vectors are fixed-length arrays of values with constant-time access and update of the vector slots, which are numbered from 0 to one less than the number of slots in the vector.

Strings are fixed-length arrays of characters with constant-time access and update of the character slots, which are numbered from 0 to one less than the number of slots in the string.

The new vector operations include the constructor (make-vector e1 e2), predicate (vector? e0), accessor (vector-ref e0 e1) and mutator (vector-set! e0 e1 e2).

These features will operate like their Racket counterparts:

Examples

> (make-vector 3 #t)

'#(#t #t #t)

> (vector? (make-vector 3 #t))

#t

> (vector-ref (make-vector 3 #t) 0)

#t

> (vector-ref (make-vector 3 #t) 2)

#t

> (let ((v (make-vector 3 #t)))
    (begin (vector-set! v 1 #f)
           v))

'#(#t #f #t)

The new string operations include the constructor (make-string e1 e2), predicate (string? e0), and accessor (string-ref e0 e1). We will also add support for string literals.

These features will operate like their Racket counterparts:

Examples

> (make-string 3 #\t)

"ttt"

> (string? (make-string 3 #\t))

#t

> (string-ref "abc" 0)

#\a

> (string-ref "abc" 2)

#\c

We can model this syntax as an AST data type:

hoax/ast.rkt

#lang racket
;; type Expr = ...
;;           | (Prim3 Op3 Expr Expr Expr)
;; type Op1 = ...
;;          | 'vector? | 'string?
;; type Op2 = ...
;;          | 'make-vector | 'vector-ref
;;          | 'make-string | 'string-ref
;; type Op3 = 'vector-set!
13.2 Meaning of Hoax programs, implicitly🔗

We extend our definition of values, representing vectors with vectors and strings with strings (what a surprise!):

; type Value =
; | Integer
; | Boolean
; | Character
; | Eof
; | Void
; | '()
; | (cons Value Value)
; | (box Value)
; | (vector Value ...)
; | (string Character ...)

The Hoax interpreter is essentially the same as for Hustle, although with the addition of ternary primitives, plus an extension of the interp-prim module:

hoax/interp-prim.rkt

  #lang racket
  (provide interp-prim0 interp-prim1 interp-prim2 interp-prim3)
   
  ;; Op0 -> Value
  (define (interp-prim0 op)
    (match op
      ['read-byte (read-byte)]
      ['peek-byte (peek-byte)]
      ['void      (void)]))
   
  ;; Op1 Value -> Answer
  (define (interp-prim1 op v)
    (match (list op v)
      [(list 'add1 (? integer?))            (add1 v)]
      [(list 'sub1 (? integer?))            (sub1 v)]
      [(list 'zero? (? integer?))           (zero? v)]
      [(list 'char? v)                      (char? v)]
      [(list 'integer->char (? codepoint?)) (integer->char v)]
      [(list 'char->integer (? char?))      (char->integer v)]
      [(list 'write-byte    (? byte?))      (write-byte v)]
      [(list 'eof-object? v)                (eof-object? v)]
      [(list 'box v)                        (box v)]
      [(list 'unbox (? box?))               (unbox v)]
      [(list 'car (? pair?))                (car v)]
      [(list 'cdr (? pair?))                (cdr v)]
      [(list 'empty? v)                     (empty? v)]
      [(list 'cons? v)                      (cons? v)]
      [(list 'box? v)                       (box? v)]
      [(list 'vector? v)                    (vector? v)]
      [(list 'vector-length (? vector?))    (vector-length v)]
      [(list 'string? v)                    (string? v)]
      [(list 'string-length (? string?))    (string-length v)]
      [_ 'err]))
   
  ;; Op2 Value Value -> Answer
  (define (interp-prim2 op v1 v2)
    (match (list op v1 v2)
      [(list '+ (? integer?) (? integer?)) (+ v1 v2)]
      [(list '- (? integer?) (? integer?)) (- v1 v2)]
      [(list '< (? integer?) (? integer?)) (< v1 v2)]
      [(list '= (? integer?) (? integer?)) (= v1 v2)]
      [(list 'eq? v1 v2)                    (eq? v1 v2)]
      [(list 'cons v1 v2)                   (cons v1 v2)]
      [(list 'make-vector (? integer?) _)
       (if (<= 0 v1)
           (make-vector v1 v2)
           'err)]
      [(list 'vector-ref (? vector?) (? integer?))
       (if (<= 0 v2 (sub1 (vector-length v1)))
           (vector-ref v1 v2)
           'err)]
      [(list 'make-string (? integer?) (? char?))
       (if (<= 0 v1)
           (make-string v1 v2)
           'err)]
      [(list 'string-ref (? string?) (? integer?))
       (if (<= 0 v2 (sub1 (string-length v1)))
           (string-ref v1 v2)
           'err)]
      [_ 'err]))
   
  ;; Op3 Value Value Value -> Answer
  (define (interp-prim3 p v1 v2 v3)
    (match (list p v1 v2 v3)
      [(list 'vector-set! (? vector?) (? integer?) _)
       (if (<= 0 v2 (sub1 (vector-length v1)))
           (vector-set! v1 v2 v3)
           'err)]
      [_ 'err]))
   
  ;; Any -> Boolean
  (define (codepoint? v)
    (and (integer? v)
         (or (<= 0 v 55295)
             (<= 57344 v 1114111))))
   
   

Vectors are easy to model in the interpreter because we can rely on vectors in the meta-level of Racket.

This of course doesn’t illuminate much about these operations. We could, as we did for Hustle, develop an interpeter with an explicit account of memory. Instead, let’s just jump into the details of the compiler.

13.3 Representing Hoax values🔗

Now that were are comfortable with heap-allocated data-structures like boxes and pairs, handling vectors is not too difficult. Vectors are similarly heap allocated. This will require a new kind of pointer value:

- values

  + pointers (non-zero in last 3 bits)

    * boxes

    * pairs

    * vectors

    * strings

  + immediates (zero in last three bits)

    * integers

    * characters

    * booleans

    * ...

We will follow exactly the same scheme we followed for box and pairs: vectors and strings will be uniquely tagged in the lowest three bits and the remaining bits will indicate an address in memory which can be obtained by zeroing out the tag bits.

13.4 Representing and operating on vectors🔗

The memory that is pointed to by a vector pointer will contain the size of the vector followed by that many words of memory, one for each element of the vector. (Strings will be similar, with a slight twist, which we’ll examine later.)

So for example the following creates a vector of size 3 containing the values 1, #t, #\c:

(seq (Mov (Offset 'rbx 0) 3)       ; write vector length 3
     (Mov 'rax (value->bits 1))
     (Mov (Offset 'rbx 8) 'rax)    ; write 1 in vector slot 0
     (Mov 'rax (value->bits #t))
     (Mov (Offset 'rbx 16) 'rax)   ; write #t in vector slot 1
     (Mov 'rax (value->bits #\c))
     (Mov (Offset 'rbx 24) 'rax)   ; write #\c in vector slot 2
     (Mov 'rax 'rbx)
     (Or 'rax type-vect)           ; tag pointer as a vector
     (Add 'rbx 32))                ; advance rbx four words

Notice that the value written at offset 0 is 3, not (value->bits 3). This is because this slot of memory in a vector can only hold an integer, not an arbitrary value, so there’s no need to encode the type into the value—it’s position tells us it’s an integer.

Now let’s consider referencing elements of a vector. If 'rax holds a vector value, we can reference an element of the vector by untagging the value and fetching from an appropriate offset. Suppose we want to fetch the 2nd element (i.e. index 1) of a vector in 'rax:

(seq (Xor 'rax type-vect)         ; erase the vector tag
     (Mov 'rax (Offset 'rax 16))) ; load index 1 into rax

Notice that the offset here is 16 because the first word is the length, so the second word is the first element, and the third word (offset 16) is the element we want.

This code assumes the vector has a length of at least two. In general, the vector operations must check that the given index is valid for the vector. This is accomplished by checking against the length stored in the first word of the vector’s memory. Using 'r9 as a scratch register, we could insert a check as follows:

(seq (Xor 'rax type-vect)         ; erase the vector tag
     (Mov 'r9 (Offset 'rax 0))    ; load length into r9
     (Cmp 'r9 2)                  ; see if len < 2,
     (Jl 'raise_error)            ; raise error if so, otherwise
     (Mov 'rax (Offset 'rax 16))) ; load index 1 into rax

Suppose 'rax holds a vector value and we want to update the 2nd element (i.e. index 1) to be #f. Following the outline above, we can erase the vector tag, check that the index is valid, and then, rather than loading the element from memory, we can write the new element at the appropriate offset:

(seq (Xor 'rax type-vect)         ; erase the vector tag
     (Mov 'r9 (Offset 'rax 0))    ; load length into r9
     (Cmp 'r9 2)                  ; see if len < 2,
     (Jl 'raise_error)            ; raise error if so, otherwise
     (Mov (Offset 'rax 16)
          (value->bits #f))) ;    ; write #f into index 1

One final issue for vectors is what to do about the empty vector.

An empty vector has length zero and there are no elements contained within it. We could reprent empty vectors the same as non-empty vectors, although this would mean allocating a word of memory to hold the length 0 and pointing to it. This design would also have the drawback that there could many different empty vectors.

Another approach is to avoid allocating memory and have a single representation for the empty vector. One way to achieve this to represent the address of the empty vector as the null pointer (i.e. 0) and therefore the empty vector value is represented by the vector type tag. Some code, such as the code to print vectors, will need to have a special case for the empty vector to avoid a null dereference when trying to load the length of the vector. Similarly, there will be a special case in the implementation of make-vector to produce the empty vector value when given a size of zero. This avoids allocating memory for the empty vector and has the nice benefit that there is a unique representation of the empty vector.

13.5 Representing and operating on strings🔗

Strings will be very much like vectors—after all, they are just another kind of array value. The key difference is that strings are arrays not of arbitrary values, but of characters.

While could use a vector to represent a string, with a unique pointer tag, this would waste memory: every character would be allocated 64-bits of memory. Since we use unicode codepoints to represent characters and because strings are homogenous we need at most 21-bits to represent each character of a string.

There are many different representations for strings of Unicode characters, but one of the simplest is that of UTF-32. It is a fixed-width encoding that uses 32-bits for each character. This is still wasteful, but has the benefit of supporting string-ref in constant time. Had we not needed to implement string-ref with this guarantee, other less wasteful encodings such as UTF-8 could be used. We’ll use UTF-32 as a nice balance of simplicity and economy of memory usage.

So the basic idea will be that a string will be represented by a distinct tag in the lower three bits of an 8-byte aligned address. The pointer points to memory where, like a vector, the first word holds the length of the string, followed by an array of 32-bit slots, each holding a character codepoint.

There is a wrinkle: an odd length string would seemingly occupy a segment of memory that does not fall on an 8-byte boundary. For example, a string of length 1 would occupy 64+32=96-bits=12-bytes of memory. This would violate our assumption that the next free memory address ends in #b000.

The solution is simple: allocate 32-bits more when the length is odd. This sacrafices a small amount of memory in order to preserve the invariant that allows our low-order tagging of pointers.

Another complication is that we will now want to read and write 32-bits of memory. Until now, we’ve always operated on memory in units of 64-bits. We could “fake it” by reading and writing 64-bits at a time, carefully making sure to ignore or preserve half of the bits, however this makes the code a mess and is inefficient.

The better solution is to introduce a 32-bit register: 'eax. The 'eax register is not actually a new register, but rather is a name for the lower 32-bits of 'rax (so be careful: modifying one will change the other—they are the same register!). Whenever 'eax is used in a memory read or write, the CPU will read or write 32-bits instead of 64.

So, suppose we want to create the string "abc":

(seq (Mov (Offset 'rbx 0) 3)       ; write string length 3
     (Mov 'eax (char->integer #\a))
     (Mov (Offset 'rbx 8) 'eax)    ; write #\a in string slot 0
     (Mov 'eax (char->integer #\b))
     (Mov (Offset 'rbx 12) 'eax)   ; write #\b in string slot 1
     (Mov 'eax (char->integer #\c))
     (Mov (Offset 'rbx 16) 'eax)   ; write #\c in string slot 2
     (Mov 'rax 'rbx)
     (Or 'rax type-str)            ; tag pointer as a string
     (Add 'rbx 24))                ; advance rbx three words(!)

This looks a lot like the creation of a vector, however note that we
  • use 'eax to write 32-bits of memory,

  • advance the offset by 4-bytes (32-bits) on each subsequent character,

  • write (char->integer #\a) instead of (value->bits #\a) into memory,

  • increment 'rbx by 24, even though we’ve only written 20 bytes.

Now let’s consider referencing elements of a string. Suppose 'rax holds a string value, we can reference an element of the string by untagging the value and fetching from an appropriate offset. This is just like referencing an element of a vector, except:
  • the offset will be computed differently,

  • only 32-bits should be loaded from memory, and

  • the codepoint needs to be converted into a character.

Suppose we want to fetch the 2nd element (i.e. index 1) of a string in 'rax:

(seq (Xor 'rax type-str)          ; erase the string tag
     (Mov 'eax (Offset 'rax 12))  ; load index 1 into eax
     (Sal 'rax char-shift)
     (Or 'rax char-type))         ; convert codepoint to character

Note the use of offset 12 here: 8-bytes to skip past the length plus 4 bytes to skip past the first character. The 'eax register is used to load 32-bits of memory, then the value is converted to a character by shifting and tagging.

Just as we did with vectors, we want the compiler to emit code that checks indices are inbound for the string.

13.6 A Compiler for Hoax🔗

Most of the work for the Hoax compiler is done in the compilation of the new operations:

hoax/compile-ops.rkt

  #lang racket
  (provide compile-op0 compile-op1 compile-op2 compile-op3 pad-stack)
  (require "ast.rkt")
  (require "types.rkt")
  (require "assert.rkt")
  (require a86/ast)
   
  (define rax 'rax)
  (define eax 'eax) ; 32-bit load/store
  (define rbx 'rbx) ; heap
  (define rdi 'rdi) ; arg
  (define r8  'r8)  ; scratch in op2
  (define r9  'r9)  ; scratch
  (define r10 'r10) ; scratch
   
  (define r15 'r15) ; stack pad (non-volatile)
  (define rsp 'rsp) ; stack
   
  ;; Op0 -> Asm
  (define (compile-op0 p)
    (match p
      ['void      (seq (Mov rax (value->bits (void))))]
      ['read-byte (seq pad-stack (Call 'read_byte) unpad-stack)]
      ['peek-byte (seq pad-stack (Call 'peek_byte) unpad-stack)]))
   
  ;; Op1 -> Asm
  (define (compile-op1 p)
    (match p
      ['add1
       (seq (assert-integer rax)
            (Add rax (value->bits 1)))]
      ['sub1
       (seq (assert-integer rax)
            (Sub rax (value->bits 1)))]
      ['zero?
       (seq (assert-integer rax)
            (Cmp rax 0)
            if-equal)]
      ['char?
       (seq (And rax mask-char)
            (Cmp rax type-char)
            if-equal)]
      ['char->integer
       (seq (assert-char rax)
            (Sar rax char-shift)
            (Sal rax int-shift))]
      ['integer->char
       (seq (assert-codepoint rax)
            (Sar rax int-shift)
            (Sal rax char-shift)
            (Xor rax type-char))]
      ['eof-object?
       (seq (Cmp rax (value->bits eof))
            if-equal)]
      ['write-byte
       (seq (assert-byte rax)
            pad-stack
            (Mov rdi rax)
            (Call 'write_byte)
            unpad-stack)]
      ['box
       (seq (Mov (Offset rbx 0) rax) ; memory write
            (Mov rax rbx)            ; put box in rax
            (Xor rax type-box)       ; tag as a box
            (Add rbx 8))]
      ['unbox
       (seq (assert-box rax)
            (Mov rax (Offset rax (- type-box))))]
      ['car
       (seq (assert-cons rax)
            (Mov rax (Offset rax (- 8 type-cons))))]
      ['cdr
       (seq (assert-cons rax)
            (Mov rax (Offset rax (- type-cons))))]
   
      ['empty? (seq (Cmp rax (value->bits '())) if-equal)]
      ['cons? (type-pred ptr-mask type-cons)]
      ['box?  (type-pred ptr-mask type-box)]
      ['vector? (type-pred ptr-mask type-vect)]
      ['string? (type-pred ptr-mask type-str)]
      ['vector-length
       (let ((zero (gensym))
             (done (gensym)))
         (seq (assert-vector rax)
              (Cmp rax type-vect)
              (Je zero)
              (Mov rax (Offset rax (- type-vect)))
              (Sal rax int-shift)
              (Jmp done)
              (Label zero)
              (Mov rax 0)
              (Label done)))]
      ['string-length
       (let ((zero (gensym))
             (done (gensym)))
         (seq (assert-string rax)
              (Cmp rax type-str)
              (Je zero)
              (Mov rax (Offset rax (- type-str)))
              (Sal rax int-shift)
              (Jmp done)
              (Label zero)
              (Mov rax 0)
              (Label done)))]))
   
   
  ;; Op2 -> Asm
  (define (compile-op2 p)
    (match p
      ['+
       (seq (Pop r8)
            (assert-integer r8)
            (assert-integer rax)
            (Add rax r8))]
      ['-
       (seq (Pop r8)
            (assert-integer r8)
            (assert-integer rax)
            (Sub r8 rax)
            (Mov rax r8))]
      ['<
       (seq (Pop r8)
            (assert-integer r8)
            (assert-integer rax)
            (Cmp r8 rax)
            if-lt)]
      ['=
       (seq (Pop r8)
            (assert-integer r8)
            (assert-integer rax)
            (Cmp r8 rax)
            if-equal)]
      ['cons
       (seq (Mov (Offset rbx 0) rax)
            (Pop rax)
            (Mov (Offset rbx 8) rax)
            (Mov rax rbx)
            (Xor rax type-cons)
            (Add rbx 16))]
      ['eq?
       (seq (Pop r8)
            (Cmp rax r8)
            if-equal)]
      ['make-vector ;; size value
       (let ((loop (gensym))
             (done (gensym))
             (empty (gensym)))
         (seq (Pop r8) ;; r8 = size
              (assert-natural r8)
              (Cmp r8 0) ; special case empty vector
              (Je empty)
   
              (Mov r9 rbx)
              (Xor r9 type-vect)
   
              (Sar r8 int-shift)
              (Mov (Offset rbx 0) r8)
              (Add rbx 8)
   
              (Label loop)
              (Mov (Offset rbx 0) rax)
              (Add rbx 8)
              (Sub r8 1)
              (Cmp r8 0)
              (Jne loop)
   
              (Mov rax r9)
              (Jmp done)
   
              (Label empty)
              (Mov rax type-vect)
              (Label done)))]
      ['vector-ref ; vector index
       (seq (Pop r8)
            (assert-vector r8)
            (assert-integer rax)
            (Cmp r8 type-vect)
            (Je 'err) ; special case for empty vector
            (Cmp rax 0)
            (Jl 'err)
            (Xor r8 type-vect)      ; r8 = ptr
            (Mov r9 (Offset r8 0))  ; r9 = len
            (Sar rax int-shift)     ; rax = index
            (Sub r9 1)
            (Cmp r9 rax)
            (Jl 'err)
            (Sal rax 3)
            (Add r8 rax)
            (Mov rax (Offset r8 8)))]
      ['make-string
       (let ((loop (gensym))
             (done (gensym))
             (empty (gensym)))
         (seq (Pop r8)
              (assert-natural r8)
              (assert-char rax)
              (Cmp r8 0) ; special case empty string
              (Je empty)
   
              (Mov r9 rbx)
              (Xor r9 type-str)
   
              (Sar r8 int-shift)
              (Mov (Offset rbx 0) r8)
              (Add rbx 8)
   
              (Sar rax char-shift)
   
              (Add r8 1) ; adds 1
              (Sar r8 1) ; when
              (Sal r8 1) ; len is odd
   
              (Label loop)
              (Mov (Offset rbx 0) eax)
              (Add rbx 4)
              (Sub r8 1)
              (Cmp r8 0)
              (Jne loop)
   
              (Mov rax r9)
              (Jmp done)
   
              (Label empty)
              (Mov rax type-str)
              (Label done)))]
      ['string-ref
       (seq (Pop r8)
            (assert-string r8)
            (assert-integer rax)
            (Cmp r8 type-str)
            (Je 'err) ; special case for empty string
            (Cmp rax 0)
            (Jl 'err)
            (Xor r8 type-str)       ; r8 = ptr
            (Mov r9 (Offset r8 0))  ; r9 = len
            (Sar rax int-shift)     ; rax = index
            (Sub r9 1)
            (Cmp r9 rax)
            (Jl 'err)
            (Sal rax 2)
            (Add r8 rax)
            (Mov 'eax (Offset r8 8))
            (Sal rax char-shift)
            (Xor rax type-char))]))
   
  ;; Op3 -> Asm
  (define (compile-op3 p)
    (match p
      ['vector-set!
       (seq (Pop r10)
            (Pop r8)
            (assert-vector r8)
            (assert-integer r10)
            (Cmp r8 type-vect)
            (Je 'err)
            (Cmp r10 0)
            (Jl 'err)
            (Xor r8 type-vect)       ; r8 = ptr
            (Mov r9 (Offset r8 0))   ; r9 = len
            (Sar r10 int-shift)      ; r10 = index
            (Sub r9 1)
            (Cmp r9 r10)
            (Jl 'err)
            (Sal r10 3)
            (Add r8 r10)
            (Mov (Offset r8 8) rax)
            (Mov rax (value->bits (void))))]))
   
  (define (type-pred mask type)
    (seq (And rax mask)
         (Cmp rax type)
         if-equal))
   
  ;; Asm
  ;; set rax to #t or #f if comparison flag is equal
  (define if-equal
    (seq (Mov rax (value->bits #f))
         (Mov r9  (value->bits #t))
         (Cmove rax r9)))
   
  ;; Asm
  ;; set rax to #t or #f if comparison flag is less than
  (define if-lt
    (seq (Mov rax (value->bits #f))
         (Mov r9  (value->bits #t))
         (Cmovl rax r9)))
   
   
  ;; Asm
  ;; Dynamically pad the stack to be aligned for a call
  (define pad-stack
    (seq (Mov r15 rsp)
         (And r15 #b1000)
         (Sub rsp r15)))
   
  ;; Asm
  ;; Undo the stack alignment after a call
  (define unpad-stack
    (seq (Add rsp r15)))
   
   

We can now confirm that the compiler generates code similar to what we wrote by hand above:

Examples

> (define (show e c)
    (compile-e (parse e) c))
> (show '(make-vector 3 #t) '())

(list

 (Mov 'rax 48)

 (Push 'rax)

 (Mov 'rax 24)

 (Pop 'r8)

 (Mov 'r9 'r8)

 (And 'r9 15)

 (Cmp 'r9 0)

 (Jne 'err)

 (Cmp 'r8 0)

 (Jl 'err)

 (Cmp 'r8 0)

 (Je 'g7205)

 (Mov 'r9 'rbx)

 (Xor 'r9 3)

 (Sar 'r8 4)

 (Mov (Offset '(+ rbx 0)) 'r8)

 (Add 'rbx 8)

 (Label 'g7203)

 (Mov (Offset '(+ rbx 0)) 'rax)

 (Add 'rbx 8)

 (Sub 'r8 1)

 (Cmp 'r8 0)

 (Jne 'g7203)

 (Mov 'rax 'r9)

 (Jmp 'g7204)

 (Label 'g7205)

 (Mov 'rax 3)

 (Label 'g7204))

> (show '(vector-ref x 1) '(x))

(list

 (Mov 'rax (Offset '(+ rsp 0)))

 (Push 'rax)

 (Mov 'rax 16)

 (Pop 'r8)

 (Mov 'r9 'r8)

 (And 'r9 7)

 (Cmp 'r9 3)

 (Jne 'err)

 (Mov 'r9 'rax)

 (And 'r9 15)

 (Cmp 'r9 0)

 (Jne 'err)

 (Cmp 'r8 3)

 (Je 'err)

 (Cmp 'rax 0)

 (Jl 'err)

 (Xor 'r8 3)

 (Mov 'r9 (Offset '(+ r8 0)))

 (Sar 'rax 4)

 (Sub 'r9 1)

 (Cmp 'r9 'rax)

 (Jl 'err)

 (Sal 'rax 3)

 (Add 'r8 'rax)

 (Mov 'rax (Offset '(+ r8 8))))

> (show '"abc" '())

(list

 (Mov 'rax 3)

 (Mov (Offset '(+ rbx 0)) 'rax)

 (Mov 'rax 97)

 (Mov (Offset '(+ rbx 8)) 'eax)

 (Mov 'rax 98)

 (Mov (Offset '(+ rbx 12)) 'eax)

 (Mov 'rax 99)

 (Mov (Offset '(+ rbx 16)) 'eax)

 (Mov 'rax 'rbx)

 (Xor 'rax 4)

 (Add 'rbx 24))

> (show '(string-ref x 1) '(x))

(list

 (Mov 'rax (Offset '(+ rsp 0)))

 (Push 'rax)

 (Mov 'rax 16)

 (Pop 'r8)

 (Mov 'r9 'r8)

 (And 'r9 7)

 (Cmp 'r9 4)

 (Jne 'err)

 (Mov 'r9 'rax)

 (And 'r9 15)

 (Cmp 'r9 0)

 (Jne 'err)

 (Cmp 'r8 4)

 (Je 'err)

 (Cmp 'rax 0)

 (Jl 'err)

 (Xor 'r8 4)

 (Mov 'r9 (Offset '(+ r8 0)))

 (Sar 'rax 4)

 (Sub 'r9 1)

 (Cmp 'r9 'rax)

 (Jl 'err)

 (Sal 'rax 2)

 (Add 'r8 'rax)

 (Mov 'eax (Offset '(+ r8 8)))

 (Sal 'rax 5)

 (Xor 'rax 8))

13.7 A Run-Time for Hoax🔗

First, we extend the value interface to include vectors:

values.h

#ifndef VALUES_H
#define VALUES_H

#include <stdint.h>

/* any abstract value */
typedef int64_t val_t;

typedef enum type_t {
  T_INVALID = -1,
  /* immediates */
  T_INT,
  T_BOOL,
  T_CHAR,
  T_EOF,  
  T_VOID,
  T_EMPTY,
  /* pointers */
  T_BOX,
  T_CONS,
  T_VECT,
  T_STR,
} type_t;

typedef uint32_t val_char_t;
typedef struct val_box_t {
  val_t val;
} val_box_t;
typedef struct val_cons_t {
  val_t snd;
  val_t fst;
} val_cons_t;
typedef struct val_vect_t {
  uint64_t len;
  val_t elems[];
} val_vect_t;
typedef struct val_str_t {
  uint64_t len;
  val_char_t codepoints[];
} val_str_t;

/* return the type of x */
type_t val_typeof(val_t x);

/**
 * Wrap/unwrap values
 *
 * The behavior of unwrap functions are undefined on type mismatch.
 */
int64_t val_unwrap_int(val_t x);
val_t val_wrap_int(int64_t i);
val_t val_wrap_byte(unsigned char b);

int val_unwrap_bool(val_t x);
val_t val_wrap_bool(int b);

val_char_t val_unwrap_char(val_t x);
val_t val_wrap_char(val_char_t b);

val_t val_wrap_eof();

val_t val_wrap_void();

val_box_t* val_unwrap_box(val_t x);
val_t val_wrap_box(val_box_t* b);

val_cons_t* val_unwrap_cons(val_t x);
val_t val_wrap_cons(val_cons_t* c);

val_vect_t* val_unwrap_vect(val_t x);
val_t val_wrap_vect(val_vect_t* c);

val_str_t* val_unwrap_str(val_t x);
val_t val_wrap_str(val_str_t* c);

#endif

The implementation of val_typeof is extended to handle another pointer type:

values.c

#include "types.h"
#include "values.h"

type_t val_typeof(val_t x)
{
  switch (x & ptr_type_mask) {
  case box_type_tag:
    return T_BOX;
  case cons_type_tag:
    return T_CONS;
  case vect_type_tag:
    return T_VECT;
  case str_type_tag:
    return T_STR;
  }

  if ((int_type_mask & x) == int_type_tag)
    return T_INT;
  if ((char_type_mask & x) == char_type_tag)
    return T_CHAR;

  switch (x) {
  case val_true:
  case val_false:
    return T_BOOL;
  case val_eof:
    return T_EOF;
  case val_void:
    return T_VOID;
  case val_empty:
    return T_EMPTY;
  }

  return T_INVALID;
}

int64_t val_unwrap_int(val_t x)
{
  return x >> int_shift;
}
val_t val_wrap_int(int64_t i)
{
  return (i << int_shift) | int_type_tag;
}
val_t val_wrap_byte(unsigned char b)
{
  return (b << int_shift) | int_type_tag;
}

int val_unwrap_bool(val_t x)
{
  return x == val_true;
}
val_t val_wrap_bool(int b)
{
  return b ? val_true : val_false;
}

val_char_t val_unwrap_char(val_t x)
{
  return (val_char_t)(x >> char_shift);
}
val_t val_wrap_char(val_char_t c)
{
  return (((val_t)c) << char_shift) | char_type_tag;
}

val_t val_wrap_eof(void)
{
  return val_eof;
}

val_t val_wrap_void(void)
{
  return val_void;
}

val_box_t* val_unwrap_box(val_t x)
{
  return (val_box_t *)(x ^ box_type_tag);
}
val_t val_wrap_box(val_box_t* b)
{
  return ((val_t)b) | box_type_tag;
}

val_cons_t* val_unwrap_cons(val_t x)
{
  return (val_cons_t *)(x ^ cons_type_tag);
}
val_t val_wrap_cons(val_cons_t *c)
{
  return ((val_t)c) | cons_type_tag;
}

val_vect_t* val_unwrap_vect(val_t x)
{
  return (val_vect_t *)(x ^ vect_type_tag);
}
val_t val_wrap_vect(val_vect_t *v)
{
  return ((val_t)v) | vect_type_tag;
}

val_str_t* val_unwrap_str(val_t x)
{
  return (val_str_t *)(x ^ str_type_tag);
}
val_t val_wrap_str(val_str_t *v)
{
  return ((val_t)v) | str_type_tag;
}

Printing is updated to handle vectors and strings. Note that printing of strings seems complicated by this code is actually auto-generated from the Unicode specification.

print.c

#include <stdio.h>
#include <inttypes.h>
#include "values.h"

void print_char(val_char_t);
void print_codepoint(val_char_t);
void print_cons(val_cons_t *);
void print_vect(val_vect_t*);
void print_str(val_str_t*);
void print_str_char(val_char_t);
void print_result_interior(val_t);
int utf8_encode_char(val_char_t, char *);

void print_result(val_t x)
{
  switch (val_typeof(x)) {
  case T_INT:
    printf("%" PRId64, val_unwrap_int(x));
    break;
  case T_BOOL:
    printf(val_unwrap_bool(x) ? "#t" : "#f");
    break;
  case T_CHAR:
    print_char(val_unwrap_char(x));
    break;
  case T_EOF:
    printf("#<eof>");
    break;
  case T_VOID:
    break;
  case T_EMPTY:
  case T_BOX:
  case T_CONS:
  case T_VECT:    
    printf("'");
    print_result_interior(x);
    break;
  case T_STR:
    putchar('"');
    print_str(val_unwrap_str(x));
    putchar('"');
    break;    
  case T_INVALID:
    printf("internal error");
  }
}

void print_result_interior(val_t x)
{
  switch (val_typeof(x)) {
  case T_EMPTY:
    printf("()");
    break;
  case T_BOX:
    printf("#&");
    print_result_interior(val_unwrap_box(x)->val);
    break;
  case T_CONS:
    printf("(");
    print_cons(val_unwrap_cons(x));
    printf(")");
    break;
  case T_VECT:
    print_vect(val_unwrap_vect(x));
    break;    
  default:
    print_result(x);
  }
}

void print_vect(val_vect_t *v)
{
  uint64_t i;

  if (!v) { printf("#()"); return; }

  printf("#(");
  for (i = 0; i < v->len; ++i) {
    print_result_interior(v->elems[i]);

    if (i < v->len - 1)
      putchar(' ');
  }
  printf(")");
}

void print_cons(val_cons_t *cons)
{
  print_result_interior(cons->fst);

  switch (val_typeof(cons->snd)) {
  case T_EMPTY:
    // nothing
    break;
  case T_CONS:
    printf(" ");
    print_cons(val_unwrap_cons(cons->snd));
    break;
  default:
    printf(" . ");
    print_result_interior(cons->snd);
    break;
  }
}

void print_str(val_str_t* s)
{
  if (!s) return;
  uint64_t i;
  for (i = 0; i < s->len; ++i)
    print_str_char(s->codepoints[i]);
}

void print_str_char_u(val_char_t c)
{
  printf("\\u%04X", c);
}

void print_str_char_U(val_char_t c)
{
  printf("\\U%08X", c);
}

void print_str_char(val_char_t c)
{
  switch (c) {
  case 0 ... 6:
    print_str_char_u(c);
    break;
  case 7:
    printf("\\a");
    break;
  case 8:
    printf("\\b");
    break;
  case 9:
    printf("\\t");
    break;
  case 10:
    printf("\\n");
    break;
  case 11:
    printf("\\v");
    break;
  case 12:
    printf("\\f");
    break;
  case 13:
    printf("\\r");
    break;
  case 14 ... 26:
    print_str_char_u(c);
    break;
  case 27:
    printf("\\e");
    break;
  case 28 ... 31:
    print_str_char_u(c);
    break;
  case 34:
    printf("\\\"");
    break;
  case 39:
    printf("'");
    break;
  case 92:
    printf("\\\\");
    break;
  case 127 ... 159:
  case 173 ... 173:
  case 888 ... 889:
  case 896 ... 899:
  case 907 ... 907:
  case 909 ... 909:
  case 930 ... 930:
  case 1328 ... 1328:
  case 1367 ... 1368:
  case 1376 ... 1376:
  case 1416 ... 1416:
  case 1419 ... 1420:
  case 1424 ... 1424:
  case 1480 ... 1487:
  case 1515 ... 1519:
  case 1525 ... 1541:
  case 1564 ... 1565:
  case 1757 ... 1757:
  case 1806 ... 1807:
  case 1867 ... 1868:
  case 1970 ... 1983:
  case 2043 ... 2047:
  case 2094 ... 2095:
  case 2111 ... 2111:
  case 2140 ... 2141:
  case 2143 ... 2207:
  case 2227 ... 2275:
  case 2436 ... 2436:
  case 2445 ... 2446:
  case 2449 ... 2450:
  case 2473 ... 2473:
  case 2481 ... 2481:
  case 2483 ... 2485:
  case 2490 ... 2491:
  case 2501 ... 2502:
  case 2505 ... 2506:
  case 2511 ... 2518:
  case 2520 ... 2523:
  case 2526 ... 2526:
  case 2532 ... 2533:
  case 2556 ... 2560:
  case 2564 ... 2564:
  case 2571 ... 2574:
  case 2577 ... 2578:
  case 2601 ... 2601:
  case 2609 ... 2609:
  case 2612 ... 2612:
  case 2615 ... 2615:
  case 2618 ... 2619:
  case 2621 ... 2621:
  case 2627 ... 2630:
  case 2633 ... 2634:
  case 2638 ... 2640:
  case 2642 ... 2648:
  case 2653 ... 2653:
  case 2655 ... 2661:
  case 2678 ... 2688:
  case 2692 ... 2692:
  case 2702 ... 2702:
  case 2706 ... 2706:
  case 2729 ... 2729:
  case 2737 ... 2737:
  case 2740 ... 2740:
  case 2746 ... 2747:
  case 2758 ... 2758:
  case 2762 ... 2762:
  case 2766 ... 2767:
  case 2769 ... 2783:
  case 2788 ... 2789:
  case 2802 ... 2816:
  case 2820 ... 2820:
  case 2829 ... 2830:
  case 2833 ... 2834:
  case 2857 ... 2857:
  case 2865 ... 2865:
  case 2868 ... 2868:
  case 2874 ... 2875:
  case 2885 ... 2886:
  case 2889 ... 2890:
  case 2894 ... 2901:
  case 2904 ... 2907:
  case 2910 ... 2910:
  case 2916 ... 2917:
  case 2936 ... 2945:
  case 2948 ... 2948:
  case 2955 ... 2957:
  case 2961 ... 2961:
  case 2966 ... 2968:
  case 2971 ... 2971:
  case 2973 ... 2973:
  case 2976 ... 2978:
  case 2981 ... 2983:
  case 2987 ... 2989:
  case 3002 ... 3005:
  case 3011 ... 3013:
  case 3017 ... 3017:
  case 3022 ... 3023:
  case 3025 ... 3030:
  case 3032 ... 3045:
  case 3067 ... 3071:
  case 3076 ... 3076:
  case 3085 ... 3085:
  case 3089 ... 3089:
  case 3113 ... 3113:
  case 3130 ... 3132:
  case 3141 ... 3141:
  case 3145 ... 3145:
  case 3150 ... 3156:
  case 3159 ... 3159:
  case 3162 ... 3167:
  case 3172 ... 3173:
  case 3184 ... 3191:
  case 3200 ... 3200:
  case 3204 ... 3204:
  case 3213 ... 3213:
  case 3217 ... 3217:
  case 3241 ... 3241:
  case 3252 ... 3252:
  case 3258 ... 3259:
  case 3269 ... 3269:
  case 3273 ... 3273:
  case 3278 ... 3284:
  case 3287 ... 3293:
  case 3295 ... 3295:
  case 3300 ... 3301:
  case 3312 ... 3312:
  case 3315 ... 3328:
  case 3332 ... 3332:
  case 3341 ... 3341:
  case 3345 ... 3345:
  case 3387 ... 3388:
  case 3397 ... 3397:
  case 3401 ... 3401:
  case 3407 ... 3414:
  case 3416 ... 3423:
  case 3428 ... 3429:
  case 3446 ... 3448:
  case 3456 ... 3457:
  case 3460 ... 3460:
  case 3479 ... 3481:
  case 3506 ... 3506:
  case 3516 ... 3516:
  case 3518 ... 3519:
  case 3527 ... 3529:
  case 3531 ... 3534:
  case 3541 ... 3541:
  case 3543 ... 3543:
  case 3552 ... 3557:
  case 3568 ... 3569:
  case 3573 ... 3584:
  case 3643 ... 3646:
  case 3676 ... 3712:
  case 3715 ... 3715:
  case 3717 ... 3718:
  case 3721 ... 3721:
  case 3723 ... 3724:
  case 3726 ... 3731:
  case 3736 ... 3736:
  case 3744 ... 3744:
  case 3748 ... 3748:
  case 3750 ... 3750:
  case 3752 ... 3753:
  case 3756 ... 3756:
  case 3770 ... 3770:
  case 3774 ... 3775:
  case 3781 ... 3781:
  case 3783 ... 3783:
  case 3790 ... 3791:
  case 3802 ... 3803:
  case 3808 ... 3839:
  case 3912 ... 3912:
  case 3949 ... 3952:
  case 3992 ... 3992:
  case 4029 ... 4029:
  case 4045 ... 4045:
  case 4059 ... 4095:
  case 4294 ... 4294:
  case 4296 ... 4300:
  case 4302 ... 4303:
  case 4681 ... 4681:
  case 4686 ... 4687:
  case 4695 ... 4695:
  case 4697 ... 4697:
  case 4702 ... 4703:
  case 4745 ... 4745:
  case 4750 ... 4751:
  case 4785 ... 4785:
  case 4790 ... 4791:
  case 4799 ... 4799:
  case 4801 ... 4801:
  case 4806 ... 4807:
  case 4823 ... 4823:
  case 4881 ... 4881:
  case 4886 ... 4887:
  case 4955 ... 4956:
  case 4989 ... 4991:
  case 5018 ... 5023:
  case 5109 ... 5119:
  case 5789 ... 5791:
  case 5881 ... 5887:
  case 5901 ... 5901:
  case 5909 ... 5919:
  case 5943 ... 5951:
  case 5972 ... 5983:
  case 5997 ... 5997:
  case 6001 ... 6001:
  case 6004 ... 6015:
  case 6110 ... 6111:
  case 6122 ... 6127:
  case 6138 ... 6143:
  case 6158 ... 6159:
  case 6170 ... 6175:
  case 6264 ... 6271:
  case 6315 ... 6319:
  case 6390 ... 6399:
  case 6431 ... 6431:
  case 6444 ... 6447:
  case 6460 ... 6463:
  case 6465 ... 6467:
  case 6510 ... 6511:
  case 6517 ... 6527:
  case 6572 ... 6575:
  case 6602 ... 6607:
  case 6619 ... 6621:
  case 6684 ... 6685:
  case 6751 ... 6751:
  case 6781 ... 6782:
  case 6794 ... 6799:
  case 6810 ... 6815:
  case 6830 ... 6831:
  case 6847 ... 6911:
  case 6988 ... 6991:
  case 7037 ... 7039:
  case 7156 ... 7163:
  case 7224 ... 7226:
  case 7242 ... 7244:
  case 7296 ... 7359:
  case 7368 ... 7375:
  case 7415 ... 7415:
  case 7418 ... 7423:
  case 7670 ... 7675:
  case 7958 ... 7959:
  case 7966 ... 7967:
  case 8006 ... 8007:
  case 8014 ... 8015:
  case 8024 ... 8024:
  case 8026 ... 8026:
  case 8028 ... 8028:
  case 8030 ... 8030:
  case 8062 ... 8063:
  case 8117 ... 8117:
  case 8133 ... 8133:
  case 8148 ... 8149:
  case 8156 ... 8156:
  case 8176 ... 8177:
  case 8181 ... 8181:
  case 8191 ... 8191:
  case 8203 ... 8207:
  case 8232 ... 8238:
  case 8288 ... 8303:
  case 8306 ... 8307:
  case 8335 ... 8335:
  case 8349 ... 8351:
  case 8382 ... 8399:
  case 8433 ... 8447:
  case 8586 ... 8591:
  case 9211 ... 9215:
  case 9255 ... 9279:
  case 9291 ... 9311:
  case 11124 ... 11125:
  case 11158 ... 11159:
  case 11194 ... 11196:
  case 11209 ... 11209:
  case 11218 ... 11263:
  case 11311 ... 11311:
  case 11359 ... 11359:
  case 11508 ... 11512:
  case 11558 ... 11558:
  case 11560 ... 11564:
  case 11566 ... 11567:
  case 11624 ... 11630:
  case 11633 ... 11646:
  case 11671 ... 11679:
  case 11687 ... 11687:
  case 11695 ... 11695:
  case 11703 ... 11703:
  case 11711 ... 11711:
  case 11719 ... 11719:
  case 11727 ... 11727:
  case 11735 ... 11735:
  case 11743 ... 11743:
  case 11843 ... 11903:
  case 11930 ... 11930:
  case 12020 ... 12031:
  case 12246 ... 12271:
  case 12284 ... 12287:
  case 12352 ... 12352:
  case 12439 ... 12440:
  case 12544 ... 12548:
  case 12590 ... 12592:
  case 12687 ... 12687:
  case 12731 ... 12735:
  case 12772 ... 12783:
  case 12831 ... 12831:
  case 13055 ... 13055:
  case 19894 ... 19903:
  case 40909 ... 40959:
  case 42125 ... 42127:
  case 42183 ... 42191:
  case 42540 ... 42559:
  case 42654 ... 42654:
  case 42744 ... 42751:
  case 42895 ... 42895:
  case 42926 ... 42927:
  case 42930 ... 42998:
  case 43052 ... 43055:
  case 43066 ... 43071:
  case 43128 ... 43135:
  case 43205 ... 43213:
  case 43226 ... 43231:
  case 43260 ... 43263:
  case 43348 ... 43358:
  case 43389 ... 43391:
  case 43470 ... 43470:
  case 43482 ... 43485:
  case 43519 ... 43519:
  case 43575 ... 43583:
  case 43598 ... 43599:
  case 43610 ... 43611:
  case 43715 ... 43738:
  case 43767 ... 43776:
  case 43783 ... 43784:
  case 43791 ... 43792:
  case 43799 ... 43807:
  case 43815 ... 43815:
  case 43823 ... 43823:
  case 43872 ... 43875:
  case 43878 ... 43967:
  case 44014 ... 44015:
  case 44026 ... 44031:
  case 55204 ... 55215:
  case 55239 ... 55242:
  case 55292 ... 55295:
  case 57344 ... 63743:
  case 64110 ... 64111:
  case 64218 ... 64255:
  case 64263 ... 64274:
  case 64280 ... 64284:
  case 64311 ... 64311:
  case 64317 ... 64317:
  case 64319 ... 64319:
  case 64322 ... 64322:
  case 64325 ... 64325:
  case 64450 ... 64466:
  case 64832 ... 64847:
  case 64912 ... 64913:
  case 64968 ... 65007:
  case 65022 ... 65023:
  case 65050 ... 65055:
  case 65070 ... 65071:
  case 65107 ... 65107:
  case 65127 ... 65127:
  case 65132 ... 65135:
  case 65141 ... 65141:
  case 65277 ... 65280:
  case 65471 ... 65473:
  case 65480 ... 65481:
  case 65488 ... 65489:
  case 65496 ... 65497:
  case 65501 ... 65503:
  case 65511 ... 65511:
  case 65519 ... 65531:
  case 65534 ... 65535:
    print_str_char_u(c);
    break;
  case 65548 ... 65548:
  case 65575 ... 65575:
  case 65595 ... 65595:
  case 65598 ... 65598:
  case 65614 ... 65615:
  case 65630 ... 65663:
  case 65787 ... 65791:
  case 65795 ... 65798:
  case 65844 ... 65846:
  case 65933 ... 65935:
  case 65948 ... 65951:
  case 65953 ... 65999:
  case 66046 ... 66175:
  case 66205 ... 66207:
  case 66257 ... 66271:
  case 66300 ... 66303:
  case 66340 ... 66351:
  case 66379 ... 66383:
  case 66427 ... 66431:
  case 66462 ... 66462:
  case 66500 ... 66503:
  case 66518 ... 66559:
  case 66718 ... 66719:
  case 66730 ... 66815:
  case 66856 ... 66863:
  case 66916 ... 66926:
  case 66928 ... 67071:
  case 67383 ... 67391:
  case 67414 ... 67423:
  case 67432 ... 67583:
  case 67590 ... 67591:
  case 67593 ... 67593:
  case 67638 ... 67638:
  case 67641 ... 67643:
  case 67645 ... 67646:
  case 67670 ... 67670:
  case 67743 ... 67750:
  case 67760 ... 67839:
  case 67868 ... 67870:
  case 67898 ... 67902:
  case 67904 ... 67967:
  case 68024 ... 68029:
  case 68032 ... 68095:
  case 68100 ... 68100:
  case 68103 ... 68107:
  case 68116 ... 68116:
  case 68120 ... 68120:
  case 68148 ... 68151:
  case 68155 ... 68158:
  case 68168 ... 68175:
  case 68185 ... 68191:
  case 68256 ... 68287:
  case 68327 ... 68330:
  case 68343 ... 68351:
  case 68406 ... 68408:
  case 68438 ... 68439:
  case 68467 ... 68471:
  case 68498 ... 68504:
  case 68509 ... 68520:
  case 68528 ... 68607:
  case 68681 ... 69215:
  case 69247 ... 69631:
  case 69710 ... 69713:
  case 69744 ... 69758:
  case 69821 ... 69821:
  case 69826 ... 69839:
  case 69865 ... 69871:
  case 69882 ... 69887:
  case 69941 ... 69941:
  case 69956 ... 69967:
  case 70007 ... 70015:
  case 70089 ... 70092:
  case 70094 ... 70095:
  case 70107 ... 70112:
  case 70133 ... 70143:
  case 70162 ... 70162:
  case 70206 ... 70319:
  case 70379 ... 70383:
  case 70394 ... 70400:
  case 70404 ... 70404:
  case 70413 ... 70414:
  case 70417 ... 70418:
  case 70441 ... 70441:
  case 70449 ... 70449:
  case 70452 ... 70452:
  case 70458 ... 70459:
  case 70469 ... 70470:
  case 70473 ... 70474:
  case 70478 ... 70486:
  case 70488 ... 70492:
  case 70500 ... 70501:
  case 70509 ... 70511:
  case 70517 ... 70783:
  case 70856 ... 70863:
  case 70874 ... 71039:
  case 71094 ... 71095:
  case 71114 ... 71167:
  case 71237 ... 71247:
  case 71258 ... 71295:
  case 71352 ... 71359:
  case 71370 ... 71839:
  case 71923 ... 71934:
  case 71936 ... 72383:
  case 72441 ... 73727:
  case 74649 ... 74751:
  case 74863 ... 74863:
  case 74869 ... 77823:
  case 78895 ... 92159:
  case 92729 ... 92735:
  case 92767 ... 92767:
  case 92778 ... 92781:
  case 92784 ... 92879:
  case 92910 ... 92911:
  case 92918 ... 92927:
  case 92998 ... 93007:
  case 93018 ... 93018:
  case 93026 ... 93026:
  case 93048 ... 93052:
  case 93072 ... 93951:
  case 94021 ... 94031:
  case 94079 ... 94094:
  case 94112 ... 110591:
  case 110594 ... 113663:
  case 113771 ... 113775:
  case 113789 ... 113791:
  case 113801 ... 113807:
  case 113818 ... 113819:
  case 113824 ... 118783:
  case 119030 ... 119039:
  case 119079 ... 119080:
  case 119155 ... 119162:
  case 119262 ... 119295:
  case 119366 ... 119551:
  case 119639 ... 119647:
  case 119666 ... 119807:
  case 119893 ... 119893:
  case 119965 ... 119965:
  case 119968 ... 119969:
  case 119971 ... 119972:
  case 119975 ... 119976:
  case 119981 ... 119981:
  case 119994 ... 119994:
  case 119996 ... 119996:
  case 120004 ... 120004:
  case 120070 ... 120070:
  case 120075 ... 120076:
  case 120085 ... 120085:
  case 120093 ... 120093:
  case 120122 ... 120122:
  case 120127 ... 120127:
  case 120133 ... 120133:
  case 120135 ... 120137:
  case 120145 ... 120145:
  case 120486 ... 120487:
  case 120780 ... 120781:
  case 120832 ... 124927:
  case 125125 ... 125126:
  case 125143 ... 126463:
  case 126468 ... 126468:
  case 126496 ... 126496:
  case 126499 ... 126499:
  case 126501 ... 126502:
  case 126504 ... 126504:
  case 126515 ... 126515:
  case 126520 ... 126520:
  case 126522 ... 126522:
  case 126524 ... 126529:
  case 126531 ... 126534:
  case 126536 ... 126536:
  case 126538 ... 126538:
  case 126540 ... 126540:
  case 126544 ... 126544:
  case 126547 ... 126547:
  case 126549 ... 126550:
  case 126552 ... 126552:
  case 126554 ... 126554:
  case 126556 ... 126556:
  case 126558 ... 126558:
  case 126560 ... 126560:
  case 126563 ... 126563:
  case 126565 ... 126566:
  case 126571 ... 126571:
  case 126579 ... 126579:
  case 126584 ... 126584:
  case 126589 ... 126589:
  case 126591 ... 126591:
  case 126602 ... 126602:
  case 126620 ... 126624:
  case 126628 ... 126628:
  case 126634 ... 126634:
  case 126652 ... 126703:
  case 126706 ... 126975:
  case 127020 ... 127023:
  case 127124 ... 127135:
  case 127151 ... 127152:
  case 127168 ... 127168:
  case 127184 ... 127184:
  case 127222 ... 127231:
  case 127245 ... 127247:
  case 127279 ... 127279:
  case 127340 ... 127343:
  case 127387 ... 127461:
  case 127491 ... 127503:
  case 127547 ... 127551:
  case 127561 ... 127567:
  case 127570 ... 127743:
  case 127789 ... 127791:
  case 127870 ... 127871:
  case 127951 ... 127955:
  case 127992 ... 127999:
  case 128255 ... 128255:
  case 128331 ... 128335:
  case 128378 ... 128378:
  case 128420 ... 128420:
  case 128579 ... 128580:
  case 128720 ... 128735:
  case 128749 ... 128751:
  case 128756 ... 128767:
  case 128884 ... 128895:
  case 128981 ... 129023:
  case 129036 ... 129039:
  case 129096 ... 129103:
  case 129114 ... 129119:
  case 129160 ... 129167:
  case 129198 ... 131071:
  case 173783 ... 173823:
  case 177973 ... 177983:
  case 178206 ... 194559:
  case 195102 ... 917759:
  case 918000 ... 1114110:
    print_str_char_U(c);
    break;
  default:
    print_codepoint(c);
    break;
  }
}

void print_char(val_char_t c)
{
  printf("#\\");
  switch (c) {
  case 0:
    printf("nul"); break;
  case 8:
    printf("backspace"); break;
  case 9:
    printf("tab"); break;
  case 10:
    printf("newline"); break;
  case 11:
    printf("vtab"); break;
  case 12:
    printf("page"); break;
  case 13:
    printf("return"); break;
  case 32:
    printf("space"); break;
  case 127:
    printf("rubout"); break;
  default:
    print_codepoint(c);
  }
}

void print_codepoint(val_char_t c)
{
  char buffer[5] = {0};
  utf8_encode_char(c, buffer);
  printf("%s", buffer);
}

int utf8_encode_char(val_char_t c, char *buffer)
{
  // Output to buffer using UTF-8 encoding of codepoint
  // https://en.wikipedia.org/wiki/UTF-8
  if (c < 128) {
    buffer[0] = (char) c;
    return 1;
  } else if (c < 2048) {
    buffer[0] =  (char)(c >> 6)       | 192;
    buffer[1] = ((char)       c & 63) | 128;
    return 2;
  } else if (c < 65536) {
    buffer[0] =  (char)(c >> 12)      | 224;
    buffer[1] = ((char)(c >> 6) & 63) | 128;
    buffer[2] = ((char)       c & 63) | 128;
    return 3;
  } else {
    buffer[0] =  (char)(c >> 18)       | 240;
    buffer[1] = ((char)(c >> 12) & 63) | 128;
    buffer[2] = ((char)(c >>  6) & 63) | 128;
    buffer[3] = ((char)        c & 63) | 128;
    return 4;
  }
}