Merged changes in the trunk up to revision 45619.
[blender.git] / source / blender / blenlib / intern / string_utf8.c
1 /*
2  * ***** BEGIN GPL LICENSE BLOCK *****
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  *
18  * The Original Code is Copyright (C) 2011 Blender Foundation.
19  * Code from gutf8.c Copyright (C) 1999 Tom Tromey
20  *                   Copyright (C) 2000 Red Hat, Inc.
21  * All rights reserved.
22  *
23  * Contributor(s): Campbell Barton.
24  *
25  * ***** END GPL LICENSE BLOCK *****
26  *
27  */
28
29  /** \file blender/blenlib/intern/string_utf8.c
30   *  \ingroup bli
31   */
32
33 #include <string.h>
34 #include <wchar.h>
35 #include <wctype.h>
36
37 #include "BLI_string_utf8.h"
38
39 /* from libswish3, originally called u8_isvalid(),
40  * modified to return the index of the bad character (byte index not utf).
41  * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
42
43 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel
44  *
45  * length is in bytes, since without knowing whether the string is valid
46  * it's hard to know how many characters there are! */
47
/* Number of continuation bytes expected after each possible lead byte.
 * Bytes below 0xc0 map to 0. 0xfe and 0xff map to 5 here and are rejected
 * explicitly in BLI_utf8_invalid_byte(). */
static const char trailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/**
 * Find the first invalid UTF-8 sequence in a byte buffer.
 *
 * \param str    bytes to check (does not need to be nul-terminated).
 * \param length number of bytes in \a str to examine.
 * \return the byte index of the first byte of the first invalid sequence,
 *         or -1 when all \a length bytes are valid.
 *
 * A sequence is invalid when it starts with a continuation byte, when it is
 * cut short by the end of the buffer, when one of its trailing bytes is not a
 * continuation byte, when it is an overlong encoding, or when it starts with
 * 0xfe / 0xff. No byte at or beyond str[length] is ever read.
 */
int BLI_utf8_invalid_byte(const char *str, int length)
{
	const unsigned char *start = (const unsigned char *)str;
	const unsigned char *pend, *p;

	if (length <= 0) {
		return -1;
	}
	pend = start + length;

	for (p = start; p < pend; p++) {
		/* errors are always reported at the first byte of the sequence */
		const unsigned char *lead = p;
		const unsigned char c = *p;
		int ab;

		if (c < 128) {
			continue;
		}

		/* a continuation byte (10xxxxxx) cannot start a sequence */
		if ((c & 0xc0) != 0xc0) {
			return (int)(lead - start);
		}

		/* the lead byte plus 'ab' trailing bytes must fit in what is left */
		ab = trailingBytesForUTF8[c];
		if ((pend - p) <= ab) {
			return (int)(lead - start);
		}

		/* Check top bits in the second byte */
		p++;
		if ((*p & 0xc0) != 0x80) {
			return (int)(lead - start);
		}

		/* Check for overlong sequences for each different length */
		switch (ab) {
			/* Check for xx00 000x */
			case 1:
				if ((c & 0x3e) == 0) {
					return (int)(lead - start);
				}
				continue;   /* We know there aren't any more bytes to check */

			/* Check for 1110 0000, xx0x xxxx */
			case 2:
				if (c == 0xe0 && (*p & 0x20) == 0) {
					return (int)(lead - start);
				}
				break;

			/* Check for 1111 0000, xx00 xxxx */
			case 3:
				if (c == 0xf0 && (*p & 0x30) == 0) {
					return (int)(lead - start);
				}
				break;

			/* Check for 1111 1000, xx00 0xxx */
			case 4:
				if (c == 0xf8 && (*p & 0x38) == 0) {
					return (int)(lead - start);
				}
				break;

			/* Check for leading 0xfe or 0xff,
			 * and then for 1111 1100, xx00 00xx */
			case 5:
				if (c == 0xfe || c == 0xff ||
				    (c == 0xfc && (*p & 0x3c) == 0))
				{
					return (int)(lead - start);
				}
				break;

			default:
				break;
		}

		/* Check for valid bytes after the 2nd, if any; all must start 10 */
		while (--ab > 0) {
			p++;
			if ((*p & 0xc0) != 0x80) {
				return (int)(lead - start);
			}
		}
	}

	return -1;
}

/**
 * Remove invalid UTF-8 bytes from a string, in place.
 *
 * \param str    string to clean; str[length] must be its nul terminator,
 *               since the terminator is moved along with the remaining bytes.
 * \param length number of bytes in \a str, not counting the terminator.
 * \return the number of bytes that were removed.
 *
 * Each pass removes the single byte reported by BLI_utf8_invalid_byte() and
 * re-checks the remainder, so a broken multi-byte sequence is removed one
 * byte at a time.
 */
int BLI_utf8_invalid_strip(char *str, int length)
{
	int bad_char, tot = 0;

	while ((bad_char = BLI_utf8_invalid_byte(str, length)) != -1) {
		str += bad_char;
		/* bytes left after the bad one (the bad byte itself is dropped) */
		length -= (bad_char + 1);

		if (length == 0) {
			/* last character bad, strip it */
			*str = '\0';
			tot++;
			break;
		}

		/* strip, keep looking; +1 so the terminator moves too */
		memmove(str, str + 1, (size_t)length + 1);
		tot++;
	}

	return tot;
}
148
149
150 /* compatible with BLI_strncpy, but ensure no partial utf8 chars */
151
152 /* array copied from glib's gutf8.c,
153  * note: this looks to be at odds with 'trailingBytesForUTF8',
154  * need to find out what gives here! - campbell */
/* Total length in bytes (lead byte included) of the sequence introduced by
 * each possible lead byte. Continuation bytes and 0xfe / 0xff map to 1. */
static const size_t utf8_skip_data[256] = {
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/* Copy whole UTF-8 sequences from 'src' to 'dst' while each sequence plus a
 * terminator still fits in 'maxncpy' bytes, then nul-terminate 'dst'.
 * 'maxncpy' must be at least 1. Returns a pointer to the written terminator.
 *
 * Sequence lengths come from the lead byte only; continuation bytes are not
 * validated. A sequence cut short by the end of 'src' is dropped rather than
 * copied, so nothing past the source terminator is ever read. */
static char *utf8_copy_whole_chars(char *dst, const char *src, size_t maxncpy)
{
	/* index the table with unsigned bytes, plain 'char' may be signed */
	const unsigned char *s = (const unsigned char *)src;

	while (*s != '\0') {
		const size_t utf8_size = utf8_skip_data[*s];
		size_t i;

		/* keep one byte for the terminator */
		if (utf8_size >= maxncpy) {
			break;
		}

		/* stop if the source ends in the middle of this sequence */
		for (i = 1; i < utf8_size && s[i] != '\0'; i++) {
			/* do nothing */
		}
		if (i < utf8_size) {
			break;
		}

		memcpy(dst, s, utf8_size);
		dst += utf8_size;
		s += utf8_size;
		maxncpy -= utf8_size;
	}

	*dst = '\0';
	return dst;
}

/**
 * Like BLI_strncpy, but never copies part of a multi-byte UTF-8 character.
 *
 * \param dst     destination buffer.
 * \param src     nul-terminated source string.
 * \param maxncpy size of \a dst in bytes, including the terminator.
 *                When 0, \a dst is left untouched.
 * \return \a dst.
 *
 * \note invalid UTF-8 in \a src is not detected; lengths are taken from the
 * lead bytes only.
 */
char *BLI_strncpy_utf8(char *dst, const char *src, size_t maxncpy)
{
	if (maxncpy != 0) {
		utf8_copy_whole_chars(dst, src, maxncpy);
	}

	return dst;
}

/**
 * Append \a src to \a dst without splitting a multi-byte UTF-8 character.
 *
 * \param dst     nul-terminated destination string.
 * \param src     nul-terminated string to append.
 * \param maxncpy total size of \a dst in bytes, including the terminator.
 * \return a pointer to the terminator of the resulting string (the end of
 *         \a dst, not its start).
 *
 * When no terminator is found within the first \a maxncpy bytes of \a dst
 * nothing is written.
 */
char *BLI_strncat_utf8(char *dst, const char *src, size_t maxncpy)
{
	/* skip the existing contents, tracking how much room is left */
	while (*dst && maxncpy > 0) {
		dst++;
		maxncpy--;
	}

	if (maxncpy == 0) {
		/* no room left, not even for a terminator */
		return dst;
	}

	return utf8_copy_whole_chars(dst, src, maxncpy);
}
206
207 /* --------------------------------------------------------------------------*/
208 /* wchar_t / utf8 functions  */
209
/**
 * Encode a wide-character string as UTF-8 into a byte buffer.
 *
 * \param dst    destination buffer.
 * \param src    nul-terminated wide string.
 * \param maxcpy size of \a dst in bytes, including the terminator.
 *               When 0, nothing is written.
 * \return the number of bytes written, not counting the terminator.
 *
 * The encoded size of each character is measured before it is written, so a
 * character that does not fit (together with the terminator) ends the copy
 * instead of overrunning \a dst.
 */
size_t BLI_strncpy_wchar_as_utf8(char *dst, const wchar_t *src, const size_t maxcpy)
{
	size_t len = 0;

	if (maxcpy == 0) {
		return 0;
	}

	while (*src) {
		/* NULL output: only compute the encoded size */
		const size_t size = BLI_str_utf8_from_unicode((unsigned int)*src, NULL);

		/* len < maxcpy always holds here; keep one byte for the terminator */
		if (size >= maxcpy - len) {
			break;
		}

		BLI_str_utf8_from_unicode((unsigned int)*src, dst + len);
		len += size;
		src++;
	}

	dst[len] = '\0';

	return len;
}
221
/**
 * Number of bytes needed to store a wide-character string as UTF-8.
 *
 * \param src nul-terminated wide string.
 * \return the sum of the encoded sizes of all characters, terminator excluded.
 */
size_t BLI_wstrlen_utf8(const wchar_t *src)
{
	size_t total = 0;
	const wchar_t *wc;

	for (wc = src; *wc; wc++) {
		/* NULL output buffer: only the encoded size is computed */
		total += BLI_str_utf8_from_unicode(*wc, NULL);
	}

	return total;
}
233
/**
 * Count the characters (not bytes) in a UTF-8 string.
 *
 * \param strc nul-terminated string.
 * \return the number of characters.
 *
 * Only 2, 3 and 4 byte sequences are recognized. A lead byte whose trailing
 * bytes are not all continuation bytes is counted as one character on its
 * own and scanning resumes at the following byte. Trailing bytes are checked
 * one at a time, so a sequence cut short by the terminator never causes a
 * read past the end of the string.
 */
size_t BLI_strlen_utf8(const char *strc)
{
	size_t len = 0;

	while (*strc) {
		const unsigned char c = (unsigned char)*strc;
		size_t trail = 0;
		size_t i;

		if ((c & 0xe0) == 0xc0) {
			trail = 1;
		}
		else if ((c & 0xf0) == 0xe0) {
			trail = 2;
		}
		else if ((c & 0xf8) == 0xf0) {
			trail = 3;
		}

		/* stops at the first byte that is not 10xxxxxx, which includes '\0' */
		for (i = 1; i <= trail; i++) {
			if ((strc[i] & 0xc0) != 0x80) {
				break;
			}
		}

		/* skip the trailing bytes only when the whole sequence is well formed */
		if (i > trail) {
			strc += trail;
		}

		strc++;
		len++;
	}

	return len;
}
259
260 size_t BLI_strncpy_wchar_from_utf8(wchar_t *dst_w, const char *src_c, const size_t maxcpy)
261 {
262         int len=0;
263
264         if (dst_w==NULL || src_c==NULL) return(0);
265
266         while (*src_c && len < maxcpy) {
267                 size_t step= 0;
268                 unsigned int unicode= BLI_str_utf8_as_unicode_and_size(src_c, &step);
269                 if (unicode != BLI_UTF8_ERR) {
270                         *dst_w= (wchar_t)unicode;
271                         src_c += step;
272                 }
273                 else {
274                         *dst_w = '?';
275                         src_c= BLI_str_find_next_char_utf8(src_c, NULL);
276                 }
277                 dst_w++;
278                 len++;
279         }
280         return len;
281 }
282
283 /* end wchar_t / utf8 functions  */
284 /* --------------------------------------------------------------------------*/
285
286 /* copied from glib's gutf8.c */
287
288 /* note, glib uses unsigned int for unicode, best we do the same,
289  * though we don't typedef it - campbell */
290
/* Classify a UTF-8 lead byte.
 * Len receives the length of the sequence in bytes (1 to 6) and Mask the
 * bits of the lead byte that carry payload. For a byte that cannot start a
 * sequence Len is set to -1 and Mask is left untouched.
 * Char is evaluated more than once, so it must have no side effects. */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
	do {                                                                      \
		if ((Char) < 128) {                                                   \
			(Len) = 1;                                                        \
			(Mask) = 0x7f;                                                    \
		}                                                                     \
		else if (((Char) & 0xe0) == 0xc0) {                                   \
			(Len) = 2;                                                        \
			(Mask) = 0x1f;                                                    \
		}                                                                     \
		else if (((Char) & 0xf0) == 0xe0) {                                   \
			(Len) = 3;                                                        \
			(Mask) = 0x0f;                                                    \
		}                                                                     \
		else if (((Char) & 0xf8) == 0xf0) {                                   \
			(Len) = 4;                                                        \
			(Mask) = 0x07;                                                    \
		}                                                                     \
		else if (((Char) & 0xfc) == 0xf8) {                                   \
			(Len) = 5;                                                        \
			(Mask) = 0x03;                                                    \
		}                                                                     \
		else if (((Char) & 0xfe) == 0xfc) {                                   \
			(Len) = 6;                                                        \
			(Mask) = 0x01;                                                    \
		}                                                                     \
		else {                                                                \
			(Len) = -1;                                                       \
		}                                                                     \
	} while (0)
319
/* Same as the glib define, with an added 'Err' argument.
 * Decodes the Len-byte sequence at Chars into Result, using Mask (as set by
 * UTF8_COMPUTE) to extract the payload of the lead byte. Count is the loop
 * index. When a trailing byte is not a continuation byte (10xxxxxx), Result
 * is set to Err and Count is left at the index of that byte. */
#define UTF8_GET(Result, Chars, Count, Mask, Len, Err)                        \
	do {                                                                      \
		(Result) = (Chars)[0] & (Mask);                                       \
		(Count) = 1;                                                          \
		while ((Count) < (Len)) {                                             \
			if (((Chars)[(Count)] & 0xc0) != 0x80) {                          \
				(Result) = (Err);                                             \
				break;                                                        \
			}                                                                 \
			(Result) <<= 6;                                                   \
			(Result) |= ((Chars)[(Count)] & 0x3f);                            \
			++(Count);                                                        \
		}                                                                     \
	} while (0)
331
332
/**
 * Size in bytes of the UTF-8 character starting at \a p, judged from its
 * first byte only (same classification as the glib-derived UTF8_COMPUTE).
 *
 * \param p pointer to the first byte of a character.
 * \return 1 to 6, or -1 when the byte cannot start a character
 *         (a continuation byte, 0xfe or 0xff).
 */
int BLI_str_utf8_size(const char *p)
{
	const unsigned char lead = (unsigned char)*p;

	if (lead < 128)            return 1;
	if ((lead & 0xe0) == 0xc0) return 2;
	if ((lead & 0xf0) == 0xe0) return 3;
	if ((lead & 0xf8) == 0xf0) return 4;
	if ((lead & 0xfc) == 0xf8) return 5;
	if ((lead & 0xfe) == 0xfc) return 6;

	return -1;
}
346
347 /* was g_utf8_get_char */
348 /**
349  * BLI_str_utf8_as_unicode:
350  * @p a pointer to Unicode character encoded as UTF-8
351  *
352  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
353  * If @p does not point to a valid UTF-8 encoded character, results are
354  * undefined. If you are not sure that the bytes are complete
355  * valid Unicode characters, you should use g_utf8_get_char_validated()
356  * instead.
357  *
358  * Return value: the resulting character
359  **/
360 unsigned int BLI_str_utf8_as_unicode(const char *p)
361 {
362         int i, mask = 0, len;
363         unsigned int result;
364         unsigned char c = (unsigned char) *p;
365
366         UTF8_COMPUTE (c, mask, len);
367         if (len == -1)
368                 return BLI_UTF8_ERR;
369         UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR);
370
371         return result;
372 }
373
374 /* variant that increments the length */
375 unsigned int BLI_str_utf8_as_unicode_and_size(const char *p, size_t *index)
376 {
377         int i, mask = 0, len;
378         unsigned int result;
379         unsigned char c = (unsigned char) *p;
380
381         UTF8_COMPUTE (c, mask, len);
382         if (len == -1)
383                 return BLI_UTF8_ERR;
384         UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR);
385         *index += len;
386         return result;
387 }
388
389 /* another variant that steps over the index,
390  * note, currently this also falls back to latin1 for text drawing. */
391 unsigned int BLI_str_utf8_as_unicode_step(const char *p, size_t *index)
392 {
393         int i, mask = 0, len;
394         unsigned int result;
395         unsigned char c;
396
397         p += *index;
398         c= (unsigned char) *p;
399
400         UTF8_COMPUTE (c, mask, len);
401         if (len == -1) {
402                 /* when called with NULL end, result will never be NULL,
403                  * checks for a NULL character */
404                 char *p_next= BLI_str_find_next_char_utf8(p, NULL);
405                 /* will never return the same pointer unless '\0',
406                  * eternal loop is prevented */
407                 *index += (size_t)(p_next - p);
408                 return BLI_UTF8_ERR;
409         }
410
411         /* this is tricky since there are a few ways we can bail out of bad unicode
412          * values, 3 possible solutions. */
413 #if 0
414         UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR);
415 #elif 1
416         /* WARNING: this is NOT part of glib, or supported by similar functions.
417          * this is added for text drawing because some filepaths can have latin1
418          * characters */
419         UTF8_GET (result, p, i, mask, len, BLI_UTF8_ERR);
420         if (result == BLI_UTF8_ERR) {
421                 len= 1;
422                 result= *p;
423         }
424         /* end warning! */
425 #else
426         /* without a fallback like '?', text drawing will stop on this value */
427         UTF8_GET (result, p, i, mask, len, '?');
428 #endif
429
430         *index += len;
431         return result;
432 }
433
/* was g_unichar_to_utf8 */
/**
 * BLI_str_utf8_from_unicode:
 * @c a Unicode character code
 * \param outbuf output buffer, must have at least 6 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to outbuf.
 *
 * Converts a single character to UTF-8. No terminator is written.
 *
 * Return value: number of bytes written (1 to 6)
 **/
size_t BLI_str_utf8_from_unicode(unsigned int c, char *outbuf)
{
	/* smallest code that no longer fits in 1, 2, 3, 4 and 5 bytes */
	static const unsigned int size_limit[5] = {
		0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	/* marker bits of the first byte for sequences of 1 to 6 bytes */
	static const unsigned int lead_bits[6] = {
		0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	size_t len = 1;

	while (len < 6 && c >= size_limit[len - 1]) {
		len++;
	}

	if (outbuf != NULL) {
		size_t pos;

		/* fill the trailing bytes back to front, 6 payload bits each */
		for (pos = len - 1; pos > 0; pos--) {
			outbuf[pos] = (c & 0x3f) | 0x80;
			c >>= 6;
		}
		/* what is left of the code goes into the first byte */
		outbuf[0] = c | lead_bits[len - 1];
	}

	return len;
}
488
/* was g_utf8_find_prev_char */
/**
 * BLI_str_find_prev_char_utf8:
 * @str: pointer to the beginning of a UTF-8 encoded string
 * @p pointer to some position within @str
 *
 * Given a position @p within a UTF-8 encoded string @str, find the start
 * of the previous UTF-8 character starting before @p. Returns %NULL if no
 * UTF-8 characters are present in @str before @p
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character or %NULL.
 **/
char * BLI_str_find_prev_char_utf8(const char *str, const char *p)
{
	/* test before stepping back so that no pointer before 'str' is formed */
	while (p > str) {
		--p;
		/* anything but a continuation byte (10xxxxxx) starts a character */
		if ((*p & 0xc0) != 0x80) {
			return (char *)p;
		}
	}
	return NULL;
}
514
/* was g_utf8_find_next_char */
/**
 * BLI_str_find_next_char_utf8:
 * @p a pointer to a position within a UTF-8 encoded string
 * @end a pointer to the byte following the end of the string,
 * or %NULL to indicate that the string is nul-terminated.
 *
 * Finds the start of the next UTF-8 character in the string after @p.
 * When @p already points at a nul byte it is not advanced.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character, or %NULL when the
 * search stops exactly at @end.
 **/
char *BLI_str_find_next_char_utf8(const char *p, const char *end)
{
	if (*p != '\0') {
		/* step off the current byte, then skip continuation bytes
		 * (10xxxxxx), staying below 'end' when one was given */
		++p;
		while ((end == NULL || p < end) && (*p & 0xc0) == 0x80) {
			++p;
		}
	}

	if (p == end) {
		return NULL;
	}
	return (char *)p;
}
546
/* was g_utf8_prev_char */
/**
 * BLI_str_prev_char_utf8:
 * @p a pointer to a position within a UTF-8 encoded string
 *
 * Finds the previous UTF-8 character in the string before @p
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte. There is no lower bound on the
 * search: if @p might be the first character of the string, you must use
 * BLI_str_find_prev_char_utf8() instead.
 *
 * Return value: a pointer to the found character.
 **/
char *BLI_str_prev_char_utf8(const char *p)
{
	/* step back at least once, then keep going over continuation bytes */
	do {
		--p;
	} while ((*p & 0xc0) == 0x80);

	return (char *)p;
}
570 /* end glib copy */