Merge remote-tracking branch 'origin/master' into blender2.8
[blender.git] / tests / gtests / blenlib / BLI_string_utf8_test.cc
1 /* Apache License, Version 2.0 */
2
3 #include "testing/testing.h"
4
5 extern "C" {
6 #include "BLI_utildefines.h"
7 #include "BLI_string.h"
8 #include "BLI_string_utf8.h"
9 }
10
11 /* Note that 'common' utf-8 variants of string functions (like copy, etc.) are tested in BLI_string_test.cc
12  * However, tests below are specific utf-8 conformance ones, and since they eat quite their share of lines,
13  * they deserved their own file. */
14
15 /* -------------------------------------------------------------------- */
16 /* stubs */
17
18 extern "C" {
19
/* Forward declarations of the two stubs defined right below. */
int mk_wcwidth(wchar_t ucs);
int mk_wcswidth(const wchar_t *pwcs, size_t n);

/* Stub: the argument is ignored and the result is always 0. */
int mk_wcwidth(wchar_t ucs)
{
        (void)ucs;
        return 0;
}

/* Stub: both arguments are ignored and the result is always 0. */
int mk_wcswidth(const wchar_t *pwcs, size_t n)
{
        (void)pwcs;
        (void)n;
        return 0;
}
32
33 }
34
35 /* -------------------------------------------------------------------- */
36 /* tests */
37
/* Each entry of the table below is a triple of strings:
 *   [0] a 79 bytes (80 with NULL char) string to test,
 *   [1] the expected string result after stripping invalid utf8 bytes from [0],
 *   [2] a single-byte string whose first byte encodes the expected number of errors.
 * The table is terminated by an all-NULL sentinel row.
 *
 * The leading label of strings [0] and [1] is only there to identify the case in the test output,
 * it always has the same length in both strings of a given entry.
 *
 * Based on utf-8 decoder stress-test (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt)
 *     by Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> - 2015-08-28 - CC BY 4.0
 */
const char *utf8_invalid_tests[][3] = {
//    1  Some correct UTF-8 text
    {"You should see the Greek word 'kosme':       \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\"                    |",
     "You should see the Greek word 'kosme':       \"\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5\"                    |", "\x00"},

//    2  Boundary condition test cases
//    Note that those will pass for us, those are not erroneous unicode code points
//    (aside from \x00, which is only valid as string terminator).
//    2.1  First possible sequence of a certain length
    {"2.1.1  1 byte  (U-00000000):        \"\x00\"                                       |",
     "2.1.1  1 byte  (U-00000000):        \"\"                                       |", "\x01"},
    {"2.1.2  2 bytes (U-00000080):        \"\xc2\x80\"                                      |",
     "2.1.2  2 bytes (U-00000080):        \"\xc2\x80\"                                      |", "\x00"},
    {"2.1.3  3 bytes (U-00000800):        \"\xe0\xa0\x80\"                                     |",
     "2.1.3  3 bytes (U-00000800):        \"\xe0\xa0\x80\"                                     |", "\x00"},
    {"2.1.4  4 bytes (U-00010000):        \"\xf0\x90\x80\x80\"                                    |",
     "2.1.4  4 bytes (U-00010000):        \"\xf0\x90\x80\x80\"                                    |", "\x00"},
    {"2.1.5  5 bytes (U-00200000):        \"\xf8\x88\x80\x80\x80\"                                   |",
     "2.1.5  5 bytes (U-00200000):        \"\xf8\x88\x80\x80\x80\"                                   |", "\x00"},
    {"2.1.6  6 bytes (U-04000000):        \"\xfc\x84\x80\x80\x80\x80\"                                  |",
     "2.1.6  6 bytes (U-04000000):        \"\xfc\x84\x80\x80\x80\x80\"                                  |", "\x00"},
//    2.2  Last possible sequence of a certain length
    {"2.2.1  1 byte  (U-0000007F):        \"\x7f\"                                       |",
     "2.2.1  1 byte  (U-0000007F):        \"\x7f\"                                       |", "\x00"},
    {"2.2.2  2 bytes (U-000007FF):        \"\xdf\xbf\"                                      |",
     "2.2.2  2 bytes (U-000007FF):        \"\xdf\xbf\"                                      |", "\x00"},
    {"2.2.3  3 bytes (U-0000FFFF):        \"\xef\xbf\xbf\"                                     |",
     "2.2.3  3 bytes (U-0000FFFF):        \"\"                                     |", "\x03"},  /* matches one of 5.3 sequences... */
    {"2.2.4  4 bytes (U-001FFFFF):        \"\xf7\xbf\xbf\xbf\"                                    |",
     "2.2.4  4 bytes (U-001FFFFF):        \"\xf7\xbf\xbf\xbf\"                                    |", "\x00"},
    {"2.2.5  5 bytes (U-03FFFFFF):        \"\xfb\xbf\xbf\xbf\xbf\"                                   |",
     "2.2.5  5 bytes (U-03FFFFFF):        \"\xfb\xbf\xbf\xbf\xbf\"                                   |", "\x00"},
    {"2.2.6  6 bytes (U-7FFFFFFF):        \"\xfd\xbf\xbf\xbf\xbf\xbf\"                                  |",
     "2.2.6  6 bytes (U-7FFFFFFF):        \"\xfd\xbf\xbf\xbf\xbf\xbf\"                                  |", "\x00"},
//    2.3  Other boundary conditions
    {"2.3.1  U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\"                                          |",
     "2.3.1  U-0000D7FF = ed 9f bf = \"\xed\x9f\xbf\"                                          |", "\x00"},
    {"2.3.2  U-0000E000 = ee 80 80 = \"\xee\x80\x80\"                                          |",
     "2.3.2  U-0000E000 = ee 80 80 = \"\xee\x80\x80\"                                          |", "\x00"},
    {"2.3.3  U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\"                                          |",
     "2.3.3  U-0000FFFD = ef bf bd = \"\xef\xbf\xbd\"                                          |", "\x00"},
    {"2.3.4  U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\"                                      |",
     "2.3.4  U-0010FFFF = f4 8f bf bf = \"\xf4\x8f\xbf\xbf\"                                      |", "\x00"},
    {"2.3.5  U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\"                                      |",
     "2.3.5  U-00110000 = f4 90 80 80 = \"\xf4\x90\x80\x80\"                                      |", "\x00"},

//    3  Malformed sequences
//    3.1  Unexpected continuation bytes
//         Each unexpected continuation byte should be separately signaled as a malformed sequence of its own.
    {"3.1.1  First continuation byte 0x80: \"\x80\"                                      |",
     "3.1.1  First continuation byte 0x80: \"\"                                      |", "\x01"},
    {"3.1.2  Last  continuation byte 0xbf: \"\xbf\"                                      |",
     "3.1.2  Last  continuation byte 0xbf: \"\"                                      |", "\x01"},
    {"3.1.3  2 continuation bytes: \"\x80\xbf\"                                             |",
     "3.1.3  2 continuation bytes: \"\"                                             |", "\x02"},
    {"3.1.4  3 continuation bytes: \"\x80\xbf\x80\"                                            |",
     "3.1.4  3 continuation bytes: \"\"                                            |", "\x03"},
    {"3.1.5  4 continuation bytes: \"\x80\xbf\x80\xbf\"                                           |",
     "3.1.5  4 continuation bytes: \"\"                                           |", "\x04"},
    {"3.1.6  5 continuation bytes: \"\x80\xbf\x80\xbf\x80\"                                          |",
     "3.1.6  5 continuation bytes: \"\"                                          |", "\x05"},
    {"3.1.7  6 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\"                                         |",
     "3.1.7  6 continuation bytes: \"\"                                         |", "\x06"},
    {"3.1.8  7 continuation bytes: \"\x80\xbf\x80\xbf\x80\xbf\x80\"                                        |",
     "3.1.8  7 continuation bytes: \"\"                                        |", "\x07"},
//    3.1.9  Sequence of all 64 possible continuation bytes (0x80-0xbf):            |
    {"3.1.9      \"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
                  "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
                  "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
                  "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\" |",
     "3.1.9      \"\" |", "\x40"},
//    3.2  Lonely start characters
//    3.2.1  All 32 first bytes of 2-byte sequences (0xc0-0xdf), each followed by a space character:
    {"3.2.1      \"\xc0 \xc1 \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf "
                  "\xd0 \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \" |",
     "3.2.1      \"                                \" |", "\x20"},
//    3.2.2  All 16 first bytes of 3-byte sequences (0xe0-0xef), each followed by a space character:
    {"3.2.2      \"\xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \"                                 |",
     "3.2.2      \"                \"                                 |", "\x10"},
//    3.2.3  All 8 first bytes of 4-byte sequences (0xf0-0xf7), each followed by a space character:
    {"3.2.3      \"\xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \"                                                 |",
     "3.2.3      \"        \"                                                 |", "\x08"},
//    3.2.4  All 4 first bytes of 5-byte sequences (0xf8-0xfb), each followed by a space character:
    {"3.2.4      \"\xf8 \xf9 \xfa \xfb \"                                                         |",
     "3.2.4      \"    \"                                                         |", "\x04"},
//    3.2.5  All 2 first bytes of 6-byte sequences (0xfc-0xfd), each followed by a space character:
    {"3.2.5      \"\xfc \xfd \"                                                             |",
     "3.2.5      \"  \"                                                             |", "\x02"},
//    3.3  Sequences with last continuation byte missing
//         All bytes of an incomplete sequence should be signaled as a single malformed sequence,
//         i.e., you should see only a single replacement character in each of the next 10 tests.
//         (Characters as in section 2)
    {"3.3.1  2-byte sequence with last byte missing (U+0000):     \"\xc0\"               |",
     "3.3.1  2-byte sequence with last byte missing (U+0000):     \"\"               |", "\x01"},
    {"3.3.2  3-byte sequence with last byte missing (U+0000):     \"\xe0\x80\"              |",
     "3.3.2  3-byte sequence with last byte missing (U+0000):     \"\"              |", "\x02"},
    {"3.3.3  4-byte sequence with last byte missing (U+0000):     \"\xf0\x80\x80\"             |",
     "3.3.3  4-byte sequence with last byte missing (U+0000):     \"\"             |", "\x03"},
    {"3.3.4  5-byte sequence with last byte missing (U+0000):     \"\xf8\x80\x80\x80\"            |",
     "3.3.4  5-byte sequence with last byte missing (U+0000):     \"\"            |", "\x04"},
    {"3.3.5  6-byte sequence with last byte missing (U+0000):     \"\xfc\x80\x80\x80\x80\"           |",
     "3.3.5  6-byte sequence with last byte missing (U+0000):     \"\"           |", "\x05"},
    {"3.3.6  2-byte sequence with last byte missing (U-000007FF): \"\xdf\"               |",
     "3.3.6  2-byte sequence with last byte missing (U-000007FF): \"\"               |", "\x01"},
    {"3.3.7  3-byte sequence with last byte missing (U-0000FFFF): \"\xef\xbf\"              |",
     "3.3.7  3-byte sequence with last byte missing (U-0000FFFF): \"\"              |", "\x02"},
    {"3.3.8  4-byte sequence with last byte missing (U-001FFFFF): \"\xf7\xbf\xbf\"             |",
     "3.3.8  4-byte sequence with last byte missing (U-001FFFFF): \"\"             |", "\x03"},
    {"3.3.9  5-byte sequence with last byte missing (U-03FFFFFF): \"\xfb\xbf\xbf\xbf\"            |",
     "3.3.9  5-byte sequence with last byte missing (U-03FFFFFF): \"\"            |", "\x04"},
    {"3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \"\xfd\xbf\xbf\xbf\xbf\"           |",
     "3.3.10 6-byte sequence with last byte missing (U-7FFFFFFF): \"\"           |", "\x05"},
//    3.4  Concatenation of incomplete sequences
//         All the 10 sequences of 3.3 concatenated, you should see 10 malformed sequences being signaled:
    {"3.4      \"\xc0\xe0\x80\xf0\x80\x80\xf8\x80\x80\x80\xfc\x80\x80\x80\x80"
                "\xdf\xef\xbf\xf7\xbf\xbf\xfb\xbf\xbf\xbf\xfd\xbf\xbf\xbf\xbf\""
                "                                     |",
     "3.4      \"\"                                     |", "\x1e"},
//    3.5  Impossible bytes
//         The following two bytes cannot appear in a correct UTF-8 string
    {"3.5.1  fe = \"\xfe\"                                                               |",
     "3.5.1  fe = \"\"                                                               |", "\x01"},
    {"3.5.2  ff = \"\xff\"                                                               |",
     "3.5.2  ff = \"\"                                                               |", "\x01"},
    {"3.5.3  fe fe ff ff = \"\xfe\xfe\xff\xff\"                                                   |",
     "3.5.3  fe fe ff ff = \"\"                                                   |", "\x04"},

//    4  Overlong sequences
//       The following sequences are not malformed according to the letter of the Unicode 2.0 standard.
//       However, they are longer than necessary and a correct UTF-8 encoder is not allowed to produce them.
//       A "safe UTF-8 decoder" should reject them just like malformed sequences for two reasons:
//       (1) It helps to debug applications if overlong sequences are not treated as valid representations
//       of characters, because this helps to spot problems more quickly. (2) Overlong sequences provide
//       alternative representations of characters, that could maliciously be used to bypass filters that check
//       only for ASCII characters. For instance, a 2-byte encoded line feed (LF) would not be caught by a
//       line counter that counts only 0x0a bytes, but it would still be processed as a line feed by an unsafe
//       UTF-8 decoder later in the pipeline. From a security point of view, ASCII compatibility of UTF-8
//       sequences means also, that ASCII characters are *only* allowed to be represented by ASCII bytes
//       in the range 0x00-0x7f. To ensure this aspect of ASCII compatibility, use only "safe UTF-8 decoders"
//       that reject overlong UTF-8 sequences for which a shorter encoding exists.
//
//    4.1  Examples of an overlong ASCII character
//         With a safe UTF-8 decoder, all of the following five overlong representations of the ASCII character
//         slash ("/") should be rejected like a malformed UTF-8 sequence, for instance by substituting it with
//         a replacement character. If you see a slash below, you do not have a safe UTF-8 decoder!
    {"4.1.1  U+002F     = c0 af             = \"\xc0\xaf\"                                  |",
     "4.1.1  U+002F     = c0 af             = \"\"                                  |", "\x02"},
    {"4.1.2  U+002F     = e0 80 af          = \"\xe0\x80\xaf\"                                 |",
     "4.1.2  U+002F     = e0 80 af          = \"\"                                 |", "\x03"},
    {"4.1.3  U+002F     = f0 80 80 af       = \"\xf0\x80\x80\xaf\"                                |",
     "4.1.3  U+002F     = f0 80 80 af       = \"\"                                |", "\x04"},
    {"4.1.4  U+002F     = f8 80 80 80 af    = \"\xf8\x80\x80\x80\xaf\"                               |",
     "4.1.4  U+002F     = f8 80 80 80 af    = \"\"                               |", "\x05"},
    {"4.1.5  U+002F     = fc 80 80 80 80 af = \"\xfc\x80\x80\x80\x80\xaf\"                              |",
     "4.1.5  U+002F     = fc 80 80 80 80 af = \"\"                              |", "\x06"},
//    4.2  Maximum overlong sequences
//         Below you see the highest Unicode value that is still resulting in an overlong sequence if represented
//         with the given number of bytes. This is a boundary test for safe UTF-8 decoders. All five characters
//         should be rejected like malformed UTF-8 sequences.
    {"4.2.1  U-0000007F = c1 bf             = \"\xc1\xbf\"                                  |",
     "4.2.1  U-0000007F = c1 bf             = \"\"                                  |", "\x02"},
    {"4.2.2  U-000007FF = e0 9f bf          = \"\xe0\x9f\xbf\"                                 |",
     "4.2.2  U-000007FF = e0 9f bf          = \"\"                                 |", "\x03"},
    {"4.2.3  U-0000FFFF = f0 8f bf bf       = \"\xf0\x8f\xbf\xbf\"                                |",
     "4.2.3  U-0000FFFF = f0 8f bf bf       = \"\"                                |", "\x04"},
    {"4.2.4  U-001FFFFF = f8 87 bf bf bf    = \"\xf8\x87\xbf\xbf\xbf\"                               |",
     "4.2.4  U-001FFFFF = f8 87 bf bf bf    = \"\"                               |", "\x05"},
    {"4.2.5  U-03FFFFFF = fc 83 bf bf bf bf = \"\xfc\x83\xbf\xbf\xbf\xbf\"                              |",
     "4.2.5  U-03FFFFFF = fc 83 bf bf bf bf = \"\"                              |", "\x06"},
//    4.3  Overlong representation of the NUL character
//         The following five sequences should also be rejected like malformed UTF-8 sequences and should not be
//         treated like the ASCII NUL character.
    {"4.3.1  U+0000     = c0 80             = \"\xc0\x80\"                                  |",
     "4.3.1  U+0000     = c0 80             = \"\"                                  |", "\x02"},
    {"4.3.2  U+0000     = e0 80 80          = \"\xe0\x80\x80\"                                 |",
     "4.3.2  U+0000     = e0 80 80          = \"\"                                 |", "\x03"},
    {"4.3.3  U+0000     = f0 80 80 80       = \"\xf0\x80\x80\x80\"                                |",
     "4.3.3  U+0000     = f0 80 80 80       = \"\"                                |", "\x04"},
    {"4.3.4  U+0000     = f8 80 80 80 80    = \"\xf8\x80\x80\x80\x80\"                               |",
     "4.3.4  U+0000     = f8 80 80 80 80    = \"\"                               |", "\x05"},
    {"4.3.5  U+0000     = fc 80 80 80 80 80 = \"\xfc\x80\x80\x80\x80\x80\"                              |",
     "4.3.5  U+0000     = fc 80 80 80 80 80 = \"\"                              |", "\x06"},

//    5  Illegal code positions
//       The following UTF-8 sequences should be rejected like malformed sequences, because they never represent
//       valid ISO 10646 characters and a UTF-8 decoder that accepts them might introduce security problems
//       comparable to overlong UTF-8 sequences.
//    5.1 Single UTF-16 surrogates
    {"5.1.1  U+D800 = ed a0 80 = \"\xed\xa0\x80\"                                              |",
     "5.1.1  U+D800 = ed a0 80 = \"\"                                              |", "\x03"},
    {"5.1.2  U+DB7F = ed ad bf = \"\xed\xad\xbf\"                                              |",
     "5.1.2  U+DB7F = ed ad bf = \"\"                                              |", "\x03"},
    {"5.1.3  U+DB80 = ed ae 80 = \"\xed\xae\x80\"                                              |",
     "5.1.3  U+DB80 = ed ae 80 = \"\"                                              |", "\x03"},
    {"5.1.4  U+DBFF = ed af bf = \"\xed\xaf\xbf\"                                              |",
     "5.1.4  U+DBFF = ed af bf = \"\"                                              |", "\x03"},
    {"5.1.5  U+DC00 = ed b0 80 = \"\xed\xb0\x80\"                                              |",
     "5.1.5  U+DC00 = ed b0 80 = \"\"                                              |", "\x03"},
    {"5.1.6  U+DF80 = ed be 80 = \"\xed\xbe\x80\"                                              |",
     "5.1.6  U+DF80 = ed be 80 = \"\"                                              |", "\x03"},
    {"5.1.7  U+DFFF = ed bf bf = \"\xed\xbf\xbf\"                                              |",
     "5.1.7  U+DFFF = ed bf bf = \"\"                                              |", "\x03"},
//    5.2 Paired UTF-16 surrogates
    {"5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = \"\xed\xa0\x80\xed\xb0\x80\"                           |",
     "5.2.1  U+D800 U+DC00 = ed a0 80 ed b0 80 = \"\"                           |", "\x06"},
    {"5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = \"\xed\xa0\x80\xed\xbf\xbf\"                           |",
     "5.2.2  U+D800 U+DFFF = ed a0 80 ed bf bf = \"\"                           |", "\x06"},
    {"5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = \"\xed\xad\xbf\xed\xb0\x80\"                           |",
     "5.2.3  U+DB7F U+DC00 = ed ad bf ed b0 80 = \"\"                           |", "\x06"},
    {"5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = \"\xed\xad\xbf\xed\xbf\xbf\"                           |",
     "5.2.4  U+DB7F U+DFFF = ed ad bf ed bf bf = \"\"                           |", "\x06"},
    {"5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = \"\xed\xae\x80\xed\xb0\x80\"                           |",
     "5.2.5  U+DB80 U+DC00 = ed ae 80 ed b0 80 = \"\"                           |", "\x06"},
    {"5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = \"\xed\xae\x80\xed\xbf\xbf\"                           |",
     "5.2.6  U+DB80 U+DFFF = ed ae 80 ed bf bf = \"\"                           |", "\x06"},
    {"5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = \"\xed\xaf\xbf\xed\xb0\x80\"                           |",
     "5.2.7  U+DBFF U+DC00 = ed af bf ed b0 80 = \"\"                           |", "\x06"},
    {"5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = \"\xed\xaf\xbf\xed\xbf\xbf\"                           |",
     "5.2.8  U+DBFF U+DFFF = ed af bf ed bf bf = \"\"                           |", "\x06"},
//    5.3 Noncharacter code positions
//        The following "noncharacters" are "reserved for internal use" by applications, and according to older versions
//        of the Unicode Standard "should never be interchanged". Unicode Corrigendum #9 dropped the latter restriction.
//        Nevertheless, their presence in incoming UTF-8 data can remain a potential security risk, depending
//        on what use is made of these codes subsequently. Examples of such internal use:
//          - Some file APIs with 16-bit characters may use the integer value -1 = U+FFFF to signal
//            an end-of-file (EOF) or error condition.
//          - In some UTF-16 receivers, code point U+FFFE might trigger a byte-swap operation
//            (to convert between UTF-16LE and UTF-16BE).
//        With such internal use of noncharacters, it may be desirable and safer to block those code points in
//        UTF-8 decoders, as they should never occur legitimately in incoming UTF-8 data, and could trigger
//        unsafe behaviour in subsequent processing.
//
//        Particularly problematic noncharacters in 16-bit applications:
    {"5.3.1  U+FFFE = ef bf be = \"\xef\xbf\xbe\"                                              |",
     "5.3.1  U+FFFE = ef bf be = \"\"                                              |", "\x03"},
    {"5.3.2  U+FFFF = ef bf bf = \"\xef\xbf\xbf\"                                              |",
     "5.3.2  U+FFFF = ef bf bf = \"\"                                              |", "\x03"},
    /* For now, we ignore those, they do not seem to be crucial anyway... */
//    5.3.3  U+FDD0 .. U+FDEF
//    5.3.4  U+nFFFE U+nFFFF (for n = 1..10)
    {NULL, NULL, NULL}
};
286
287 /* BLI_utf8_invalid_strip (and indirectly, BLI_utf8_invalid_byte). */
288 TEST(string, Utf8InvalidBytes)
289 {
290         for (int i = 0; utf8_invalid_tests[i][0] != NULL; i++) {
291                 const char *tst = utf8_invalid_tests[i][0];
292                 const char *tst_stripped = utf8_invalid_tests[i][1];
293                 const int num_errors = (int)utf8_invalid_tests[i][2][0];
294
295                 char buff[80];
296                 memcpy(buff, tst, sizeof(buff));
297
298                 const int num_errors_found = BLI_utf8_invalid_strip(buff, sizeof(buff) - 1);
299
300                 printf("[%02d] -> [%02d] \"%s\"  ->  \"%s\"\n", num_errors, num_errors_found, tst, buff);
301                 EXPECT_EQ(num_errors_found, num_errors);
302                 EXPECT_STREQ(buff, tst_stripped);
303         }
304 }