svn merge -r39834:40222 https://svn.blender.org/svnroot/bf-blender/trunk/blender
[blender.git] / source / blender / blenlib / intern / string_utf8.c
1 /*
2  * $Id:
3  *
4  * ***** BEGIN GPL LICENSE BLOCK *****
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software Foundation,
18  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19  *
20  * The Original Code is Copyright (C) 2011 Blender Foundation.
21  * All rights reserved.
22  *
23  * Contributor(s): Campbell Barton.
24  *
25  * ***** END GPL LICENSE BLOCK *****
26  * 
27  */
28  
29  /** \file blender/blenlib/intern/string_utf8.c
30  *  \ingroup bli
31  */
32  
33 #include <string.h>
34
35 /* from libswish3, originally called u8_isvalid(),
36  * modified to return the index of the bad character (byte index not utf).
37  * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
38
39 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel
40
41    length is in bytes, since without knowing whether the string is valid
42    it's hard to know how many characters there are! */
43
/* Number of continuation bytes expected after a lead byte, indexed by the
 * lead byte's value. Bytes 0x00-0xbf map to 0; 0xc0-0xdf to 1; 0xe0-0xef to 2;
 * 0xf0-0xf7 to 3; 0xf8-0xfb to 4; 0xfc-0xff to 5. */
static const char trailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/**
 * Find the first invalid UTF-8 sequence in a byte buffer.
 *
 * Only the first \a length bytes of \a str are examined; the buffer does not
 * need to be NUL terminated and no byte at or past `str + length` is read.
 *
 * A sequence is rejected when:
 * - its lead byte is a stray continuation byte (10xxxxxx), 0xfe or 0xff,
 * - it is cut short by the end of the buffer,
 * - one of its trailing bytes is not a continuation byte,
 * - it is an overlong encoding.
 *
 * Sequences of up to 6 bytes are accepted, and no check is made for
 * surrogates or for code points above U+10FFFF.
 *
 * \param str: bytes to validate.
 * \param length: number of bytes in \a str to check; values <= 0 check nothing.
 * \return the byte offset of the first byte of the first invalid sequence,
 * or -1 when all \a length bytes are valid.
 */
int BLI_utf8_invalid_byte(const char *str, int length)
{
	const unsigned char *p, *perr, *pend;
	unsigned char c;
	int ab;

	if (length <= 0) {
		return -1;
	}

	perr = (const unsigned char *)str;
	pend = perr + length;

	for (p = (const unsigned char *)str; p < pend; p++) {
		c = *p;
		/* The reported offset is always that of the sequence's first byte,
		 * no matter how far into the sequence the problem is detected. */
		perr = p;

		if (c < 128)
			continue;

		/* A continuation byte (10xxxxxx) can not start a sequence. */
		if ((c & 0xc0) != 0xc0)
			goto utf8_error;

		/* 0xfe and 0xff never start a sequence. */
		if (c == 0xfe || c == 0xff)
			goto utf8_error;

		/* The whole sequence must fit in what is left of the buffer:
		 * (pend - p) counts the lead byte too, so it has to exceed 'ab'. */
		ab = trailingBytesForUTF8[c];
		if ((pend - p) <= ab)
			goto utf8_error;

		p++;
		/* Check top bits in the second byte */
		if ((*p & 0xc0) != 0x80)
			goto utf8_error;

		/* Check for overlong sequences for each different length */
		switch (ab) {
			/* Check for xx00 000x */
		case 1:
			if ((c & 0x3e) == 0) goto utf8_error;
			continue;   /* We know there aren't any more bytes to check */

			/* Check for 1110 0000, xx0x xxxx */
		case 2:
			if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
			break;

			/* Check for 1111 0000, xx00 xxxx */
		case 3:
			if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
			break;

			/* Check for 1111 1000, xx00 0xxx */
		case 4:
			if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
			break;

			/* Check for 1111 1100, xx00 00xx */
		case 5:
			if (c == 0xfc && (*p & 0x3c) == 0) goto utf8_error;
			break;

		default:
			break;
		}

		/* Check for valid bytes after the 2nd, if any; all must start 10 */
		while (--ab > 0) {
			p++;
			if ((*p & 0xc0) != 0x80) goto utf8_error;
		}
	}

	return -1;

utf8_error:

	return (int)(perr - (const unsigned char *)str);
}
120
/**
 * Remove invalid UTF-8 bytes from a string, in place.
 *
 * The buffer is scanned with #BLI_utf8_invalid_byte; each byte it reports is
 * deleted by shifting the rest of the string (terminator included) down by
 * one, then scanning resumes from that position.
 *
 * \param str: string to clean up, modified in place. `str[length]` is expected
 * to be the NUL terminator, since it is moved along with the remaining bytes.
 * \param length: number of bytes in \a str, excluding the terminator.
 * \return the number of bytes removed.
 */
int BLI_utf8_invalid_strip(char *str, int length)
{
	int bad_char, tot = 0;

	while (length > 0 && (bad_char = BLI_utf8_invalid_byte(str, length)) != -1) {
		/* Skip the valid prefix, and account for the byte being removed so
		 * 'length' is the number of bytes that follow the bad one. */
		str += bad_char;
		length -= bad_char + 1;

		if (length <= 0) {
			/* last character bad, strip it */
			*str = '\0';
			tot++;
			break;
		}

		/* strip, keep looking (+1 to move the terminator as well) */
		memmove(str, str + 1, (size_t)length + 1);
		tot++;
	}

	return tot;
}
144
145
/* compatible with BLI_strncpy, but ensure no partial utf8 chars */

/* array copied from glib's gutf8.c,
 * note: this looks to be at odds with 'trailingBytesForUTF8',
 * need to find out what gives here! - campbell */
/* Total length in bytes of a UTF-8 sequence, indexed by the value of its
 * first byte. Bytes that can not start a multi-byte sequence map to 1. */
static const size_t utf8_skip_data[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

/**
 * Copy a UTF-8 string into a fixed size buffer without ever splitting a
 * multi-byte character.
 *
 * Whole sequences are copied while they fit; copying stops at the first
 * sequence that would not leave room for the terminator, or that is cut short
 * by the end of \a src. Apart from that the bytes are not validated.
 *
 * \param dst: destination buffer, always NUL terminated unless \a maxncpy is 0.
 * \param src: NUL terminated source string.
 * \param maxncpy: size of \a dst in bytes, including the terminator. When 0,
 * nothing is written.
 * \return \a dst.
 */
char *BLI_strncpy_utf8(char *dst, const char *src, size_t maxncpy)
{
	char *dst_r = dst;
	/* Read the source as unsigned bytes: indexing the table with a plain
	 * 'char' would use a negative index for bytes >= 0x80 where 'char' is
	 * signed. */
	const unsigned char *src_u = (const unsigned char *)src;
	size_t utf8_size, i;

	if (maxncpy == 0) {
		/* no room even for the terminator */
		return dst_r;
	}

	while (*src_u != '\0' && (utf8_size = utf8_skip_data[*src_u]) < maxncpy) {
		/* Make sure the source really holds 'utf8_size' bytes, so a sequence
		 * truncated by the terminator is neither copied nor read past. */
		for (i = 1; i < utf8_size && src_u[i] != '\0'; i++) {
			/* pass */
		}
		if (i < utf8_size) {
			break;
		}

		memcpy(dst, src_u, utf8_size);
		dst += utf8_size;
		src_u += utf8_size;
		maxncpy -= utf8_size;
	}

	*dst = '\0';
	return dst_r;
}
183