Source file src/syscall/wtf8_windows.go
1 // Copyright 2023 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Windows UTF-16 strings can contain unpaired surrogates, which can't be 6 // decoded into a valid UTF-8 string. This file defines a set of functions 7 // that can be used to encode and decode potentially ill-formed UTF-16 strings 8 // by using the [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). 9 // 10 // WTF-8 is a strict superset of UTF-8, i.e. any string that is 11 // well-formed in UTF-8 is also well-formed in WTF-8 and the content 12 // is unchanged. Also, the conversion never fails and is lossless. 13 // 14 // The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string 15 // is that the conversion is lossless even for ill-formed UTF-16 strings. 16 // This property allows to read an ill-formed UTF-16 string, convert it 17 // to a Go string, and convert it back to the same original UTF-16 string. 18 // 19 // See go.dev/issues/59971 for more info. 20 21 package syscall 22 23 import ( 24 "unicode/utf16" 25 "unicode/utf8" 26 ) 27 28 const ( 29 surr1 = 0xd800 30 surr2 = 0xdc00 31 surr3 = 0xe000 32 33 tx = 0b10000000 34 t3 = 0b11100000 35 maskx = 0b00111111 36 mask3 = 0b00001111 37 38 rune1Max = 1<<7 - 1 39 rune2Max = 1<<11 - 1 40 ) 41 42 // encodeWTF16 returns the potentially ill-formed 43 // UTF-16 encoding of s. 44 func encodeWTF16(s string, buf []uint16) []uint16 { 45 for i := 0; i < len(s); { 46 // Cannot use 'for range s' because it expects valid 47 // UTF-8 runes. 48 r, size := utf8.DecodeRuneInString(s[i:]) 49 if r == utf8.RuneError { 50 // Check if s[i:] contains a valid WTF-8 encoded surrogate. 51 if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF { 52 r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx) 53 buf = append(buf, uint16(r)) 54 i += 3 55 continue 56 } 57 } 58 i += size 59 buf = utf16.AppendRune(buf, r) 60 } 61 return buf 62 } 63 64 // decodeWTF16 returns the WTF-8 encoding of 65 // the potentially ill-formed UTF-16 s. 66 func decodeWTF16(s []uint16, buf []byte) []byte { 67 for i := 0; i < len(s); i++ { 68 var ar rune 69 switch r := s[i]; { 70 case r < surr1, surr3 <= r: 71 // normal rune 72 ar = rune(r) 73 case surr1 <= r && r < surr2 && i+1 < len(s) && 74 surr2 <= s[i+1] && s[i+1] < surr3: 75 // valid surrogate sequence 76 ar = utf16.DecodeRune(rune(r), rune(s[i+1])) 77 i++ 78 default: 79 // WTF-8 fallback. 80 // This only handles the 3-byte case of utf8.AppendRune, 81 // as surrogates always fall in that case. 82 ar = rune(r) 83 if ar > utf8.MaxRune { 84 ar = utf8.RuneError 85 } 86 buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx) 87 continue 88 } 89 buf = utf8.AppendRune(buf, ar) 90 } 91 return buf 92 } 93