-
Notifications
You must be signed in to change notification settings - Fork 1
/
utf8string.lisp
698 lines (640 loc) · 28.3 KB
/
utf8string.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
(defpackage #:utf8
(:use #:cl)
(:export #:utf8-string))
(in-package #:utf8)
(deftype data () '(simple-array (unsigned-byte 8) (*)))
(deftype index () `(integer 0 (,(1- array-dimension-limit))))
(defclass utf8-string (sequence standard-object)
((%length :accessor utf8-string-length :initarg :length
:reader sequence:length
:type (integer 0 (#.array-dimension-limit)))
(%data :accessor utf8-string-data :initarg :data
:type data)))
(defmethod print-object ((o utf8-string) stream)
(print-unreadable-object (o stream :type t)
;; We do this so that errors while iterating
;; don't result in the entire print failing.
(write-string
(handler-case
;; Build a string, escaping as you go
(with-output-to-string (stream)
(write-char #\" stream)
(map nil (lambda (char)
(when (or (char= char #\")
(char= char #\\))
(write-char #\\ stream))
(write-char char stream))
o)
(write-char #\" stream))
(serious-condition () "<invalid>"))
stream)))
(defmethod make-load-form ((str utf8-string) &optional env)
(declare (ignore env))
;; Don't need to initialize separately since there's no recursion.
`(make-instance 'utf8-string
:data ',(utf8-string-data str)
:length ',(utf8-string-length str)))
;;; NOTE: Several places make arrays initialized to zero
;;; even when this is not obviously necessary; however
;;; explicit initialization ensures that the array is a valid
;;; UTF-8 string (of NULs).
(defun %make-utf8-string (length
&optional (data
(make-array
length
:element-type
'(unsigned-byte 8)
:initial-element 0)))
(make-instance 'utf8-string :length length :data data))
;;; These functions determine what role the given byte
;;; has in the utf8 encoding. byte-1-start-p tests if it
;;; starts a 1-byte codepoint, etc with other n, and
;;; continuation-byte-p tests if it's a later byte in a
;;; codepoint.
(declaim (inline byte-1-start-p byte-2-start-p
byte-3-start-p byte-4-start-p
continuation-byte-p))
(defun byte-1-start-p (byte)
(declare (type (unsigned-byte 8) byte))
(not (logbitp 7 byte)))
(defun byte-2-start-p (byte)
(declare (type (unsigned-byte 8) byte))
(= (ldb (byte 3 5) byte) #b110))
(defun byte-3-start-p (byte)
(declare (type (unsigned-byte 8) byte))
(= (ldb (byte 4 4) byte) #b1110))
(defun byte-4-start-p (byte)
(declare (type (unsigned-byte 8) byte))
(= (ldb (byte 5 3) byte) #b11110))
(defun continuation-byte-p (byte)
(declare (type (unsigned-byte 8) byte))
(= (ldb (byte 2 6) byte) #b10))
;;; Given a start byte, how many bytes is the codepoint?
(defun start-byte-length (byte)
(declare (type (unsigned-byte 8) byte))
(cond ((byte-1-start-p byte) 1)
((byte-2-start-p byte) 2)
((byte-3-start-p byte) 3)
((byte-4-start-p byte) 4)
(t (error "Invalid start byte #x~x" byte))))
(defun codepoint-length (codepoint)
(declare (type (and fixnum (integer 0)) codepoint))
(cond ((< codepoint #x80) 1)
((< codepoint #x800) 2)
((< codepoint #x10000) 3)
((< codepoint #x110000) 4)
(t (error "BUG: Codepoint #x~x out of range" codepoint))))
(defun char-length (char)
(codepoint-length (char-code char)))
;; Given ub8vector data and an index into it, return the index
;; of the beginning of the next character.
;; Does no bounds checking and mostly assumes valid encoding.
(defun next-index (data index)
(declare (type data data) (type index index))
(+ index (start-byte-length (aref data index))))
;; Like the above but backwards.
;; If the provided index is zero, returns -1.
;; Assumes valid encoding.
(defun prev-index (data index)
(declare (type data data) (type index index))
(when (zerop index) (return-from prev-index -1))
;; Just go back until a start byte is encountered.
(loop for i downfrom (1- index)
unless (continuation-byte-p (aref data index))
return i))
;; Given data and an underlying index into it,
;; return the codepoint at that position. No bounds checks.
(defun get-codepoint (data index)
(declare (type data data) (type index index))
(let ((byte0 (aref data index)))
(if (byte-1-start-p byte0)
byte0
(let ((byte1 (aref data (+ index 1))))
(if (byte-2-start-p byte0)
;; Two byte character
;; Take low five of byte0 and low six of byte1
(logior (ldb (byte 6 0) byte1)
(ash (ldb (byte 5 0) byte0) 6))
(let ((byte2 (aref data (+ index 2))))
(if (byte-3-start-p byte0)
;; Three byte character
;; 4 of byte0, 6 of byte1, 6 of byte2
(logior (ldb (byte 6 0) byte2)
(ash (ldb (byte 6 0) byte1) 6)
(ash (ldb (byte 4 0) byte0) 12))
(let ((byte3 (aref data (+ index 3))))
(if (byte-4-start-p byte0)
;; Four byte character
;; 3 of byte0, 6 each of bytes 1-3
(logior (ldb (byte 6 0) byte3)
(ash (ldb (byte 6 0) byte2) 6)
(ash (ldb (byte 6 0) byte1) 12)
(ash (ldb (byte 3 0) byte0) 18))
(error "Invalid encoding at position ~d"
index))))))))))
;; Given data and an underlying index into it,
;; return the character at that position. No bounds checks.
(defun get-char (data index)
(code-char (get-codepoint data index)))
;;; Given a codepoint, data, and an underlying index into it,
;;; write the codepoint into the data at that position.
;;; No bounds checks, may overwrite with impunity.
;;; Return value undefined.
(defun set-codepoint (codepoint data index)
(declare (type data data) (type index index))
(cond ((< codepoint #x80) ; one byte
(setf (aref data index) codepoint))
((< codepoint #x800) ; two bytes
(let ((byte0 (logior #xc0 (ldb (byte 5 6) codepoint)))
(byte1 (logior #x80 (ldb (byte 6 0) codepoint))))
(setf (aref data index) byte0
(aref data (1+ index)) byte1)))
((< codepoint #x10000) ; three byte
(let ((byte0 (logior #xe0 (ldb (byte 4 12) codepoint)))
(byte1 (logior #x80 (ldb (byte 6 6) codepoint)))
(byte2 (logior #x80 (ldb (byte 6 0) codepoint))))
(setf (aref data index) byte0
(aref data (1+ index)) byte1
(aref data (+ index 2)) byte2)))
((< codepoint #x11000) ; four byte
(let ((byte0 (logior #xf0 (ldb (byte 3 18) codepoint)))
(byte1 (logior #x80 (ldb (byte 6 12) codepoint)))
(byte2 (logior #x80 (ldb (byte 6 6) codepoint)))
(byte3 (logior #x80 (ldb (byte 6 0) codepoint))))
(setf (aref data index) byte0
(aref data (1+ index)) byte1
(aref data (+ index 2)) byte2
(aref data (+ index 3)) byte3)))
(t (error "BUG: Codepoint #x~x out of range" codepoint))))
;;; Given a character, data, and an underlying index into it,
;;; set the character at that position. No bounds checks.
;;; Returns the character.
(defun set-char (character data index)
(set-codepoint (char-code character) data index)
character)
;;; Make a new data vector based on an old one.
;;; Bytes before END are copied into the new one.
;;; Then LEN bytes of space are set to #\Nul.
;;; Then the space between start2 and end2 is copied in.
(defun adjust-data (data end len start2 end2)
(let ((result (make-array (+ end len (- end2 start2))
:element-type '(unsigned-byte 8)
:initial-element 0)))
(replace result data :end1 end :end2 end)
(replace result data :start1 (+ end len)
:start2 start2 :end2 end2)
result))
;;; Like the above, but get the sequence as an argument,
;;; and resize the data if necessary (i.e. if the new and
;;; old characters have different lengths).
;;; Resizing is, of course, hells of slow.
(defun set-data-char (new sequence byte-index)
(let* ((data (utf8-string-data sequence))
(byte (aref data byte-index))
(old-length (start-byte-length byte))
(new-length (char-length new)))
(unless (= old-length new-length)
;; Apocalyptically slow case: Resize.
(setf data
(adjust-data data byte-index new-length
(+ byte-index old-length) (length data))
(utf8-string-data sequence)
data))
;; Now write in the codepoint.
(set-char new data byte-index)))
;;; Given the index of a character, return an index into the
;;; underlying data (provided). This basically has to iterate
;;; through the sequence, so it's linear time probably.
;;; If passed the length of the string, returns the length of
;;; the data.
;;; The optional parameters are a character index and
;;; corresponding byte index; this can be used to start the
;;; iteration a bit later in the string and save time.
(defun char-index (data index
&optional (start-char 0) (start-byte 0))
(loop for r = start-byte then (next-index data r)
repeat (- index start-char)
finally (return r)))
;;; Given a data vector and a codepoint, fill the vector with
;;; that codepoint. It's assumed to be the correct length.
;;; Return the data vector.
(defun fill-vec-with-codepoint (data codepoint
&optional (start-byte 0)
(end-byte (length data)))
(declare (type data data)
(type (and fixnum (integer 0)) codepoint))
(cond ((< codepoint #x80) ; one byte
(fill data codepoint :start start-byte :end end-byte))
((< codepoint #x800) ; two byte
(let ((byte0 (logior #xc0 (ldb (byte 5 6) codepoint)))
(byte1 (logior #x80 (ldb (byte 6 0) codepoint))))
(loop for i from start-byte below end-byte by 2
do (setf (aref data i) byte0
(aref data (1+ i)) byte1))))
((< codepoint #x10000) ; three byte
(let ((byte0 (logior #xe0 (ldb (byte 4 12) codepoint)))
(byte1 (logior #x80 (ldb (byte 6 6) codepoint)))
(byte2 (logior #x80 (ldb (byte 6 0) codepoint))))
(loop for i from start-byte below end-byte by 3
do (setf (aref data i) byte0
(aref data (1+ i)) byte1
(aref data (+ i 2)) byte2))))
((< codepoint #x11000) ; four byte
(let ((byte0 (logior #xf0 (ldb (byte 3 18) codepoint)))
(byte1 (logior #x80 (ldb (byte 6 12) codepoint)))
(byte2 (logior #x80 (ldb (byte 6 6) codepoint)))
(byte3 (logior #x80 (ldb (byte 6 0) codepoint))))
(loop for i from start-byte below end-byte by 4
do (setf (aref data i) byte0
(aref data (1+ i)) byte1
(aref data (+ i 2)) byte2
(aref data (+ i 3)) byte3))))
(t (error "BUG: Codepoint #x~x out of range" codepoint)))
data)
;;; Like the above but with a character.
;;; Return the data vector.
(defun fill-vec-with-char (data char
&optional (start-byte 0)
(end-byte (length data)))
(fill-vec-with-codepoint data (char-code char)
start-byte end-byte))
;;; Given a data vector and an svector of characters, replace the
;;; data with the vector (beginning at index START).
;;; Assumes correct length.
;;; Returns the vector.
(defun replace-vec-with-charv (data string
&optional (start 0)
(start-byte 0)
(end-byte (length data)))
(declare (type (simple-array * (*)) string)
(type data data))
(loop with byte-index = start-byte
for string-index from start
until (= byte-index end-byte)
do (let ((char (aref string string-index)))
(set-char char data byte-index)
(incf byte-index (char-length char))))
data)
;;; Like the above, but with data vectors, so the bytes are just
;;; copied directly.
(defun replace-vec-with-vec (data data2
&optional (start-byte2 0)
(start-byte 0)
(end-byte (length data)))
(declare (type data data data2))
(replace data data2 :start1 start-byte :end1 end-byte :start2 start-byte2))
;;; Given a vector of characters, return the number
;;; of UTF-8 bytes required to represent it.
(defgeneric sequence-nbytes (sequence))
(defmethod sequence-nbytes ((sequence utf8-string))
(length (utf8-string-data sequence)))
(defmethod sequence-nbytes ((sequence sequence))
(reduce #'+ sequence :key #'char-length))
(defun make-utf8-string-data (length)
(make-array length :element-type '(unsigned-byte 8)
:initial-element 0))
(defun make-utf8-string-data/initial-element (length initial-element)
(let ((char-length (char-length initial-element)))
(if (= char-length 1)
(make-array length
:element-type '(unsigned-byte 8)
:initial-element (char-code initial-element))
(let ((data (make-array (* length char-length)
:element-type '(unsigned-byte 8))))
(fill-vec-with-char data initial-element)))))
(defun make-utf8-string-data/initial-contents (initial-contents)
(if (typep initial-contents 'utf8-string)
;; Special case: just copy
(copy-seq (utf8-string-data initial-contents))
;; Normal case: Slow
(let* ((vec-length (sequence-nbytes initial-contents))
(data (make-array vec-length
:element-type
'(unsigned-byte 8))))
(replace-vec-with-charv data initial-contents))))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Extensible sequences protocol
;;; Core first
;;;
(defmethod sequence:elt ((sequence utf8-string) index)
(let ((data (utf8-string-data sequence)))
(get-char data (char-index data index))))
(defmethod (setf sequence:elt) (new (sequence utf8-string) index)
(set-data-char new sequence
(char-index (utf8-string-data sequence) index)))
(defmethod sequence:make-sequence-like
((sequence utf8-string) length
&key (initial-element nil iep) (initial-contents nil icp))
(%make-utf8-string
length
(cond ((and iep icp)
(error "supplied both ~s and ~s to ~s"
:initial-element :initial-contents
'sequence:make-sequence-like))
(iep
(make-utf8-string-data/initial-element
length initial-element))
(icp
(unless (= length (length initial-contents))
(error "length mismatch in ~s" 'sequence:make-sequence-like))
(make-utf8-string-data/initial-contents initial-contents))
(t (make-utf8-string-data length)))))
;;; All utf8-strings are "actually adjustable", since the data vector
;;; is indirected. That is, this function always returns its first arg.
(defmethod sequence:adjust-sequence
((sequence utf8-string) length
&key (initial-element nil iep) (initial-contents nil icp))
(cond ((and iep icp)
(error "supplied both ~s and ~s to ~s"
:initial-element :initial-contents
'sequence:adjust-sequence))
(iep
(let ((new-len (* (char-length initial-element) length))
(data (utf8-string-data sequence)))
(if (= new-len (length data))
;; No adjustment necessary: Just write it in
(fill-vec-with-char data initial-element)
;; Make and install a new vector
(let ((new-data
(make-array new-len
:element-type '(unsigned-byte 8))))
(setf (utf8-string-data sequence)
(fill-vec-with-char new-data initial-element))))))
(icp
(let ((new-len (sequence-nbytes initial-contents))
(data (utf8-string-data sequence)))
(if (= new-len (length data))
;; No adjustment necessary
(if (typep initial-contents 'utf8-string)
(replace-vec-with-vec
data (utf8-string-data initial-contents))
(replace-vec-with-charv data initial-contents))
;; Reallocate
(let ((data (make-array new-len
:element-type '(unsigned-byte 8))))
(setf (utf8-string-data sequence)
(if (typep initial-contents 'utf8-string)
(replace-vec-with-vec
data (utf8-string-data initial-contents))
(replace-vec-with-charv
data initial-contents)))))))
(t
(let ((old-length (utf8-string-length sequence))
(data (utf8-string-data sequence)))
(cond ((> length old-length)
;; Make a new data array
;; with the extra space NULd out.
(setf (utf8-string-data sequence)
(adjust-array data (+ (length data)
(- length old-length))
:initial-element 0)))
((< length old-length)
;; Make a new data array
;; preserving only the first LENGTH characters.
(setf (utf8-string-data sequence)
(adjust-array data (char-index data length))))))))
(setf (utf8-string-length sequence) length)
sequence)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;
;;; Iteration protocol
;;;
;;; Iterators are a cons of the character index with the
;;; byte index. Both are necessary due to possible resizing.
;;; The endpoint is just the character index.
(defmacro iterator-char-index (it) `(car ,it))
(defmacro iterator-byte-index (it) `(cdr ,it))
(deftype iterator () '(cons index index))
(defun stri-prev (sequence iterator from-end)
(declare (ignore from-end)
(type iterator iterator))
(let ((data (utf8-string-data sequence)))
(setf (iterator-char-index iterator)
(1- (iterator-char-index iterator))
(iterator-byte-index iterator)
(prev-index data (iterator-byte-index iterator))))
iterator)
(defun stri-next (sequence iterator from-end)
(declare (ignore from-end)
(type iterator iterator))
(let ((data (utf8-string-data sequence)))
(setf (iterator-char-index iterator)
(1+ (iterator-char-index iterator))
(iterator-byte-index iterator)
(next-index data (iterator-byte-index iterator))))
iterator)
(defun stri-endp (sequence iterator limit from-end)
(declare (ignore sequence from-end)
(type iterator iterator) (type index limit))
(= (iterator-char-index iterator) limit))
(defun stri-elt (sequence iterator)
(declare (type iterator iterator))
(get-char (utf8-string-data sequence)
(iterator-byte-index iterator)))
(defun (setf stri-elt) (new sequence iterator)
(declare (type iterator iterator))
(set-data-char new sequence (iterator-byte-index iterator)))
(defun stri-index (sequence iterator)
(declare (ignore sequence)
(type iterator iterator))
(iterator-char-index iterator))
(defun stri-copy (sequence iterator)
(declare (ignore sequence)
(type iterator iterator))
(cons (iterator-char-index iterator)
(iterator-byte-index iterator)))
(defun stri-simple-iterator (sequence from-end start end)
(when (null end) (setf end (length sequence)))
(let ((data (utf8-string-data sequence)))
(if from-end
;; FIXME: Probably broken at the edges
(values (cons (1- end) (char-index data (1- end)))
(1- start) t)
(values (cons start (char-index data start))
end nil))))
(defmethod sequence:make-simple-sequence-iterator
((sequence utf8-string) &key from-end (start 0) end)
(stri-simple-iterator sequence from-end start end))
(defmethod sequence:make-sequence-iterator
((sequence utf8-string) &key from-end (start 0) end)
(multiple-value-bind (it limit fe)
(stri-simple-iterator sequence from-end start end)
(values it limit fe
(if from-end #'stri-prev #'stri-next)
#'stri-endp #'stri-elt #'(setf stri-elt)
#'stri-index #'stri-copy)))
;;; So as far as I can tell from the extensible sequences
;;; paper, iterator-foo are entirely optional for the sequence
;;; programmer: iteration has to go through make-sequence-iterator
;;; and use those functions. However, on SBCL, it looks like some
;;; things do assume the iterator-foo work, such as MAP-INTO.
;;; (e.g. try leaving these out, and then use a utf8-string as
;;; the map-into result)
(defmethod sequence:iterator-step ((sequence utf8-string) iterator from-end)
(if from-end
(stri-prev sequence iterator from-end)
(stri-next sequence iterator from-end)))
(defmethod sequence:iterator-endp
((sequence utf8-string) iterator limit from-end)
(stri-endp sequence iterator limit from-end))
(defmethod sequence:iterator-element ((sequence utf8-string) iterator)
(stri-elt sequence iterator))
(defmethod (setf sequence:iterator-element)
(new (sequence utf8-string) iterator)
(setf (stri-elt sequence iterator) new))
(defmethod sequence:iterator-index ((sequence utf8-string) iterator)
(stri-index sequence iterator))
(defmethod sequence:iterator-copy ((sequence utf8-string) iterator)
(stri-copy sequence iterator))
;;; map, count, find, position are probably fine
;;; however, MAP /could/ allocate four bytes per character
;;; and then shrink at the end; this would save time if
;;; any character is more than one byte, but make the all
;;; one byte case a little slower.
;; Copy the underlying bytes
(defmethod sequence:subseq
((sequence utf8-string) start &optional end)
(let ((end (or end (length sequence)))
(data (utf8-string-data sequence)))
(%make-utf8-string
(- end start)
(let ((start-byte (char-index data start)))
(subseq data start-byte
(char-index data end start start-byte))))))
;; Default method (that is, (subseq seq 0)) would work,
;; but this is even easier and doesn't iterate
(defmethod sequence:copy-seq ((sequence utf8-string))
(%make-utf8-string
(utf8-string-length sequence)
(copy-seq (utf8-string-data sequence))))
;;; While the default method is adequate, we can do better:
;;; Only resize once, and write the bytes in a tighter loop.
(defmethod sequence:fill
((sequence utf8-string) item &key (start 0) end)
(when (null end) (setf end (utf8-string-length sequence)))
(let* ((data (utf8-string-data sequence))
(start-byte (char-index data start))
(end-byte (char-index data end start start-byte))
(new-char-length (char-length item))
(new-area-len (* (- end start) new-char-length)))
(unless (= new-area-len (- end-byte start-byte))
;; We have to resize the vector (slow path)
(setf data
(adjust-data data start-byte new-area-len
end-byte
(length data))
(utf8-string-data sequence) data))
(fill-vec-with-char data item
start-byte (+ start-byte new-area-len)))
sequence)
;;; (n)substitute will be fucky
(defmethod sequence:replace
((s1 utf8-string) (s2 sequence)
&key (start1 0) end1 (start2 0) end2)
;; Do defaulting
(when (null end1) (setf end1 (length s1)))
(when (null end2) (setf end2 (length s2)))
;; Use the smaller sequence length
(let ((len1 (- end1 start1)) (len2 (- end2 start2)))
(cond ((< len1 len2) (setf end2 (+ start2 len1)))
((< len2 len1) (setf end1 (+ start1 len2)))))
(let* ((data (utf8-string-data s1))
(start-byte (char-index data start1))
(end-byte (char-index data end1 start1 start-byte))
;; How many bytes do we need for these characters?
(required-area-len (reduce #'+ s2
:start start2 :end end2
:key #'char-length)))
(unless (= required-area-len (- end-byte start-byte))
;; Resize the data if necessary
(setf data
(adjust-data data start-byte required-area-len
end-byte (length data))
(utf8-string-data s1) data))
;; Actually replace
(replace-vec-with-charv data s2 start2
start-byte
(+ start-byte required-area-len)))
s1)
;;; When s2 is another utf8-string, we can work with bytes
;;; directly and skip encoding/decoding.
(defmethod sequence:replace
((s1 utf8-string) (s2 utf8-string)
&key (start1 0) end1 (start2 0) end2)
;; Do defaulting
(when (null end1) (setf end1 (length s1)))
(when (null end2) (setf end2 (length s2)))
;; Use the smaller sequence length
(let ((len1 (- end1 start1)) (len2 (- end2 start2)))
(cond ((< len1 len2) (setf end2 (+ start2 len1)))
((< len2 len1) (setf end1 (+ start1 len2)))))
(let* ((data1 (utf8-string-data s1))
(start-byte1 (char-index data1 start1))
(end-byte1 (char-index data1 end1 start1 start-byte1))
(data2 (utf8-string-data s2))
(start-byte2 (char-index data2 start2))
(end-byte2 (char-index data2 end2 start2 start-byte2))
(new-len (- end-byte2 start-byte2)))
(unless (= new-len (- end-byte1 start-byte1))
(setf data1
(adjust-data data1 start-byte1 new-len
end-byte1 (length data1))
(utf8-string-data s1) data1))
(replace-vec-with-vec data1 data2 start-byte2
start-byte1
(+ start-byte1 new-len)))
s1)
;;; the default reverse = nreverse copy-seq is fine
;;; NREVERSE: Reverse the bytes, then iterate over
;;; them reversing individual codepoints.
;;; NOTE: Probably a faster way to do this, but I don't
;;; know it.
;;; This function is separated out for debugging
(defun fix-nreverse-data (data)
(declare (type data data))
(loop ;; how many bytes in this codepoint so far
with nbytes = 0
for i from 0 below (length data)
do (aref data i)
if (continuation-byte-p (aref data i))
do (incf nbytes)
else ; reverse codepoint
do (loop for j below (ceiling nbytes 2)
do (rotatef (aref data (- i j))
(aref data (- (+ i j) nbytes))))
(setf nbytes 0))
data)
(defmethod sequence:nreverse ((sequence utf8-string))
(setf (utf8-string-data sequence)
(fix-nreverse-data
(nreverse (utf8-string-data sequence))))
sequence)
;;; Make a sequence of the correct byte length
;;; so we don't have to resize
(defmethod sequence:concatenate ((proto utf8-string)
&rest sequences)
(let* ((lengths (mapcar #'length sequences))
(byte-lengths (mapcar #'sequence-nbytes sequences))
(byte-length (reduce #'+ byte-lengths))
(data (make-array byte-length
:element-type '(unsigned-byte 8)))
(result (%make-utf8-string (reduce #'+ lengths) data)))
(loop with byte-index = 0
for byte-length in byte-lengths
for sequence in sequences
if (typep sequence 'utf8-string)
do (replace-vec-with-vec
data (utf8-string-data sequence)
0 byte-index (+ byte-index byte-length))
else
do (replace-vec-with-charv
data sequence
0 byte-index (+ byte-index byte-length))
do (incf byte-index byte-length))
result))
;; reduce, mismatch, search is ok with default implementation
;; delete will be fucky
;; remove will be fucky but maybe less??
;; delete/remove-duplicates, same shit i guess
;; (stable-)sort, merge, probably bad