/[svn]/php/php-src/branches/PHP_5_3/ext/intl/grapheme/grapheme_string.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 306449 - (hide annotations)
Sun Dec 19 05:07:31 2010 UTC (10 years, 2 months ago) by stas
File MIME type: text/x-c
File size: 24432 byte(s)
Fix bad args to grapheme_extract(), reported by Maksymilian Arciemowicz

1 stas 262229 /*
2     +----------------------------------------------------------------------+
3     | PHP Version 5 |
4     +----------------------------------------------------------------------+
5     | This source file is subject to version 3.01 of the PHP license, |
6     | that is bundled with this package in the file LICENSE, and is |
7     | available through the world-wide-web at the following url: |
8     | http://www.php.net/license/3_01.txt |
9     | If you did not receive a copy of the PHP license and are unable to |
10     | obtain it through the world-wide-web, please send a note to |
11     | license@php.net so we can mail you a copy immediately. |
12     +----------------------------------------------------------------------+
13     | Author: Ed Batutis <ed@batutis.com> |
14     +----------------------------------------------------------------------+
15     */
16    
17     /* {{{ includes */
18     #ifdef HAVE_CONFIG_H
19     #include "config.h"
20     #endif
21    
22     #include <php.h>
23     #include "grapheme.h"
24     #include "grapheme_util.h"
25    
26     #include <unicode/utypes.h>
27     #include <unicode/ucol.h>
28     #include <unicode/ustring.h>
29     #include <unicode/ubrk.h>
30    
31     #include "ext/standard/php_string.h"
32    
33     /* }}} */
34    
/* Extraction modes understood by grapheme_extract(). The numeric value is
 * what gets registered as the matching GRAPHEME_EXTR_* constant and is also
 * used directly as the index into grapheme_extract_iters[]. */
35     #define GRAPHEME_EXTRACT_TYPE_COUNT 0
36     #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
37     #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
/* Inclusive bounds used by grapheme_extract() to validate extract_type. */
38     #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
39     #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
40    
41    
42     /* {{{ grapheme_register_constants
43     * Register API constants
44     */
45     void grapheme_register_constants( INIT_FUNC_ARGS )
46     {
47     REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48     REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49     REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50     }
51     /* }}} */
52    
53     /* {{{ proto int grapheme_strlen(string str)
54     Get number of graphemes in a string */
55     PHP_FUNCTION(grapheme_strlen)
56     {
57     unsigned char* string;
58     int string_len;
59     UChar* ustring = NULL;
60     int ustring_len = 0;
61     int ret_len;
62     UErrorCode status;
63    
64     if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
65    
66     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
67     "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
68    
69     RETURN_FALSE;
70     }
71    
72     ret_len = grapheme_ascii_check(string, string_len);
73    
74     if ( ret_len >= 0 )
75     RETURN_LONG(ret_len);
76    
77     /* convert the string to UTF-16. */
78     status = U_ZERO_ERROR;
79     intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
80    
81     if ( U_FAILURE( status ) ) {
82     /* Set global error code. */
83     intl_error_set_code( NULL, status TSRMLS_CC );
84    
85     /* Set error messages. */
86 stas 292566 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
87 stas 262229 efree( ustring );
88     RETURN_NULL();
89     }
90    
91     ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
92    
93     efree( ustring );
94    
95     if (ret_len >= 0) {
96     RETVAL_LONG(ret_len);
97     } else {
98     RETVAL_FALSE;
99     }
100     }
101     /* }}} */
102    
103     /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
104     Find position of first occurrence of a string within another */
105     PHP_FUNCTION(grapheme_strpos)
106     {
107     unsigned char *haystack, *needle;
108     int haystack_len, needle_len;
109     unsigned char *found;
110     long loffset = 0;
111     int32_t offset = 0;
112     int ret_pos, uchar_pos;
113    
114     if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
115    
116     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
117     "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
118    
119     RETURN_FALSE;
120     }
121    
122     if ( OUTSIDE_STRING(loffset, haystack_len) ) {
123    
124     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
125    
126     RETURN_FALSE;
127     }
128    
129     /* we checked that it will fit: */
130     offset = (int32_t) loffset;
131    
132     /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
133    
134     if (needle_len == 0) {
135    
136     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
137    
138     RETURN_FALSE;
139     }
140    
141    
142     /* quick check to see if the string might be there
143     * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
144     */
145     found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
146    
147     /* if it isn't there the we are done */
148     if (!found) {
149     RETURN_FALSE;
150     }
151    
152     /* if it is there, and if the haystack is ascii, we are all done */
153     if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
154    
155     RETURN_LONG(found - haystack);
156     }
157    
158     /* do utf16 part of the strpos */
159     ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC );
160    
161     if ( ret_pos >= 0 ) {
162     RETURN_LONG(ret_pos + offset);
163     } else {
164     RETURN_FALSE;
165     }
166    
167     }
168     /* }}} */
169    
170     /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
171     Find position of first occurrence of a string within another, ignoring case differences */
172     PHP_FUNCTION(grapheme_stripos)
173     {
174     unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
175     int haystack_len, needle_len;
176     unsigned char *found;
177     long loffset = 0;
178     int32_t offset = 0;
179     int ret_pos, uchar_pos;
180     int is_ascii;
181    
182     if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
183    
184     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
185     "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
186    
187     RETURN_FALSE;
188     }
189    
190     if ( OUTSIDE_STRING(loffset, haystack_len) ) {
191    
192     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
193    
194     RETURN_FALSE;
195     }
196    
197     /* we checked that it will fit: */
198     offset = (int32_t) loffset;
199    
200     /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
201    
202     if (needle_len == 0) {
203    
204     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
205    
206     RETURN_FALSE;
207     }
208    
209    
210     is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
211    
212     if ( is_ascii ) {
213     needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
214     php_strtolower((char *)needle_dup, needle_len);
215     haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
216     php_strtolower((char *)haystack_dup, haystack_len);
217    
218     found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
219    
220     efree(haystack_dup);
221     efree(needle_dup);
222    
223     if (found) {
224     RETURN_LONG(found - haystack_dup);
225     }
226    
227     /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
228     if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
229     RETURN_FALSE;
230     }
231     }
232    
233     /* do utf16 part of the strpos */
234     ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC );
235    
236     if ( ret_pos >= 0 ) {
237     RETURN_LONG(ret_pos + offset);
238     } else {
239     RETURN_FALSE;
240     }
241    
242     }
243     /* }}} */
244    
245     /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
246     Find position of last occurrence of a string within another */
247     PHP_FUNCTION(grapheme_strrpos)
248     {
249     unsigned char *haystack, *needle;
250     int haystack_len, needle_len;
251     long loffset = 0;
252     int32_t offset = 0;
253     int32_t ret_pos;
254     int is_ascii;
255    
256     if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
257    
258     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
259     "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
260    
261     RETURN_FALSE;
262     }
263    
264     if ( OUTSIDE_STRING(loffset, haystack_len) ) {
265    
266     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
267    
268     RETURN_FALSE;
269     }
270    
271     /* we checked that it will fit: */
272     offset = (int32_t) loffset;
273    
274     /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
275    
276     if (needle_len == 0) {
277    
278     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
279    
280     RETURN_FALSE;
281     }
282    
283     is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
284    
285     if ( is_ascii ) {
286    
287     ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
288    
289    
290     if ( ret_pos >= 0 ) {
291     RETURN_LONG(ret_pos);
292     }
293    
294     /* if the needle was ascii too, we are done */
295    
296     if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
297     RETURN_FALSE;
298     }
299    
300     /* else we need to continue via utf16 */
301     }
302    
303     ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC);
304    
305     if ( ret_pos >= 0 ) {
306     RETURN_LONG(ret_pos);
307     } else {
308     RETURN_FALSE;
309     }
310    
311    
312     }
313     /* }}} */
314    
315     /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
316     Find position of last occurrence of a string within another, ignoring case */
317     PHP_FUNCTION(grapheme_strripos)
318     {
319     unsigned char *haystack, *needle;
320     int haystack_len, needle_len;
321     long loffset = 0;
322     int32_t offset = 0;
323     int32_t ret_pos;
324     int is_ascii;
325    
326     if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
327    
328     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
329     "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
330    
331     RETURN_FALSE;
332     }
333    
334     if ( OUTSIDE_STRING(loffset, haystack_len) ) {
335    
336     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
337    
338     RETURN_FALSE;
339     }
340    
341     /* we checked that it will fit: */
342     offset = (int32_t) loffset;
343    
344     /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
345    
346     if (needle_len == 0) {
347    
348     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
349    
350     RETURN_FALSE;
351     }
352    
353     is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
354    
355     if ( is_ascii ) {
356     unsigned char *needle_dup, *haystack_dup;
357    
358     needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
359     php_strtolower((char *)needle_dup, needle_len);
360     haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
361     php_strtolower((char *)haystack_dup, haystack_len);
362    
363     ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
364    
365     efree(haystack_dup);
366     efree(needle_dup);
367    
368     if ( ret_pos >= 0 ) {
369     RETURN_LONG(ret_pos);
370     }
371    
372     /* if the needle was ascii too, we are done */
373    
374     if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
375     RETURN_FALSE;
376     }
377    
378     /* else we need to continue via utf16 */
379     }
380    
381     ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC);
382    
383     if ( ret_pos >= 0 ) {
384     RETURN_LONG(ret_pos);
385     } else {
386     RETURN_FALSE;
387     }
388    
389    
390     }
391     /* }}} */
392    
393     /* {{{ proto string grapheme_substr(string str, int start [, int length])
394     Returns part of a string */
395     PHP_FUNCTION(grapheme_substr)
396     {
397     unsigned char *str, *sub_str;
398     UChar *ustr;
399     int str_len, sub_str_len, ustr_len;
400     long lstart = 0, length = 0;
401     int32_t start = 0;
402     int iter_val;
403     UErrorCode status;
404     unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
405     UBreakIterator* bi = NULL;
406     int sub_str_start_pos, sub_str_end_pos;
407     int32_t (*iter_func)(UBreakIterator *);
408    
409     if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
410    
411     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
412     "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
413    
414     RETURN_FALSE;
415     }
416    
417     if ( OUTSIDE_STRING(lstart, str_len) ) {
418    
419     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
420    
421     RETURN_FALSE;
422     }
423    
424     /* we checked that it will fit: */
425     start = (int32_t) lstart;
426    
427     /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
428    
429     if ( grapheme_ascii_check(str, str_len) >= 0 ) {
430     grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
431    
432     if ( NULL == sub_str ) {
433     RETURN_FALSE;
434     }
435    
436     RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
437     }
438    
439     ustr = NULL;
440     ustr_len = 0;
441     status = U_ZERO_ERROR;
442     intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
443    
444     if ( U_FAILURE( status ) ) {
445     /* Set global error code. */
446     intl_error_set_code( NULL, status TSRMLS_CC );
447    
448     /* Set error messages. */
449 stas 292566 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
450 stas 262229 efree( ustr );
451     RETURN_FALSE;
452     }
453    
454     bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
455    
456     if( U_FAILURE(status) ) {
457     RETURN_FALSE;
458     }
459    
460     ubrk_setText(bi, ustr, ustr_len, &status);
461    
462     if ( start < 0 ) {
463     iter_func = ubrk_previous;
464     ubrk_last(bi);
465     iter_val = 1;
466     }
467     else {
468     iter_func = ubrk_next;
469     iter_val = -1;
470     }
471    
472     sub_str_start_pos = 0;
473    
474     while ( start ) {
475     sub_str_start_pos = iter_func(bi);
476    
477     if ( UBRK_DONE == sub_str_start_pos ) {
478     break;
479     }
480    
481     start += iter_val;
482     }
483    
484     if ( 0 != start || sub_str_start_pos >= ustr_len ) {
485    
486     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
487    
488     efree(ustr);
489     ubrk_close(bi);
490     RETURN_FALSE;
491     }
492    
493     if (ZEND_NUM_ARGS() <= 2) {
494    
495     /* no length supplied, return the rest of the string */
496    
497     sub_str = NULL;
498     sub_str_len = 0;
499     status = U_ZERO_ERROR;
500     intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
501    
502     efree( ustr );
503     ubrk_close( bi );
504    
505     if ( U_FAILURE( status ) ) {
506     /* Set global error code. */
507     intl_error_set_code( NULL, status TSRMLS_CC );
508    
509     /* Set error messages. */
510 stas 292566 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
511 stas 262229
512     efree( sub_str );
513    
514     RETURN_FALSE;
515     }
516    
517     /* return the allocated string, not a duplicate */
518     RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
519     }
520    
521     /* find the end point of the string to return */
522    
523     if ( length < 0 ) {
524     iter_func = ubrk_previous;
525     ubrk_last(bi);
526     iter_val = 1;
527     }
528     else {
529     iter_func = ubrk_next;
530     iter_val = -1;
531     }
532    
533     sub_str_end_pos = 0;
534    
535     while ( length ) {
536     sub_str_end_pos = iter_func(bi);
537    
538     if ( UBRK_DONE == sub_str_end_pos ) {
539     break;
540     }
541    
542     length += iter_val;
543     }
544    
545 stas 278544 if ( UBRK_DONE == sub_str_end_pos && length < 0) {
546 stas 262229
547     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
548    
549     efree(ustr);
550     ubrk_close(bi);
551     RETURN_FALSE;
552     }
553    
554     sub_str = NULL;
555     status = U_ZERO_ERROR;
556     intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
557    
558     efree( ustr );
559     ubrk_close( bi );
560    
561     if ( U_FAILURE( status ) ) {
562     /* Set global error code. */
563     intl_error_set_code( NULL, status TSRMLS_CC );
564    
565     /* Set error messages. */
566 stas 292566 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
567 stas 262229
568     if ( NULL != sub_str )
569     efree( sub_str );
570    
571     RETURN_FALSE;
572     }
573    
574     /* return the allocated string, not a duplicate */
575     RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
576    
577     }
578     /* }}} */
579    
580     /* {{{ strstr_common_handler */
581     static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
582     {
583     unsigned char *haystack, *needle, *found;
584     int haystack_len, needle_len;
585     int ret_pos, uchar_pos;
586     zend_bool part = 0;
587    
588     if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
589    
590     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
591     "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
592    
593     RETURN_FALSE;
594     }
595    
596     if (needle_len == 0) {
597    
598     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
599    
600     RETURN_FALSE;
601     }
602    
603    
604     if ( !f_ignore_case ) {
605    
606     /* ASCII optimization: quick check to see if the string might be there
607     * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
608     */
609     found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
610    
611     /* if it isn't there the we are done */
612     if ( !found ) {
613     RETURN_FALSE;
614     }
615    
616     /* if it is there, and if the haystack is ascii, we are all done */
617     if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
618     size_t found_offset = found - haystack;
619    
620     if (part) {
621     RETURN_STRINGL(((char *)haystack) , found_offset, 1);
622     } else {
623     RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
624     }
625     }
626    
627     }
628    
629     /* need to work in utf16 */
630     ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC );
631    
632     if ( ret_pos < 0 ) {
633     RETURN_FALSE;
634     }
635    
636     /* uchar_pos is the 'nth' Unicode character position of the needle */
637    
638     ret_pos = 0;
639     U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
640    
641     if (part) {
642     RETURN_STRINGL(((char *)haystack), ret_pos, 1);
643     }
644     else {
645     RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
646     }
647    
648     }
649     /* }}} */
650    
651     /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
652     Finds first occurrence of a string within another */
653     PHP_FUNCTION(grapheme_strstr)
654     {
655     strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
656     }
657     /* }}} */
658    
659     /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
660     Finds first occurrence of a string within another */
661     PHP_FUNCTION(grapheme_stristr)
662     {
663     strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
664     }
665     /* }}} */
666    
667     /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
668     inline int32_t
669     grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
670     {
671     int pos = 0, prev_pos = 0;
672     int ret_pos = 0, prev_ret_pos = 0;
673    
674     while ( 1 ) {
675     pos = ubrk_next(bi);
676    
677     if ( UBRK_DONE == pos ) {
678     break;
679     }
680    
681     /* if we are beyond our limit, then the loop is done */
682     if ( pos > csize ) {
683     break;
684     }
685    
686     /* update our pointer in the original UTF-8 buffer by as many characters
687     as ubrk_next iterated over */
688    
689     prev_ret_pos = ret_pos;
690     U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
691    
692     if ( prev_ret_pos == ret_pos ) {
693     /* something wrong - malformed utf8? */
694     break;
695     }
696    
697     prev_pos = pos;
698     }
699    
700     return ret_pos;
701     }
702     /* }}} */
703    
704     /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
705     inline int32_t
706     grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
707     {
708     int pos = 0, prev_pos = 0;
709     int ret_pos = 0, prev_ret_pos = 0;
710    
711     while ( 1 ) {
712     pos = ubrk_next(bi);
713    
714     if ( UBRK_DONE == pos ) {
715     break;
716     }
717    
718     prev_ret_pos = ret_pos;
719     U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
720    
721     if ( ret_pos > bsize ) {
722     ret_pos = prev_ret_pos;
723     break;
724     }
725    
726     if ( prev_ret_pos == ret_pos ) {
727     /* something wrong - malformed utf8? */
728     break;
729     }
730    
731     prev_pos = pos;
732     }
733    
734     return ret_pos;
735     }
736     /* }}} */
737    
738     /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
739     inline int32_t
740     grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
741     {
742     int pos = 0, next_pos = 0;
743     int ret_pos = 0;
744    
745     while ( size ) {
746     next_pos = ubrk_next(bi);
747    
748     if ( UBRK_DONE == next_pos ) {
749     break;
750     }
751     pos = next_pos;
752     size--;
753     }
754    
755     /* pos is one past the last UChar - and represent the number of code units to
756     advance in the utf-8 buffer
757     */
758    
759     U8_FWD_N(pstr, ret_pos, str_len, pos);
760    
761     return ret_pos;
762     }
763     /* }}} */
764    
765     /* {{{ grapheme extract iter function pointer array */
766     typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
767    
768     static grapheme_extract_iter grapheme_extract_iters[] = {
769     &grapheme_extract_count_iter,
770     &grapheme_extract_bytecount_iter,
771     &grapheme_extract_charcount_iter,
772     };
773     /* }}} */
774    
775     /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
776     Function to extract a sequence of default grapheme clusters */
777     PHP_FUNCTION(grapheme_extract)
778     {
779     unsigned char *str, *pstr;
780     UChar *ustr;
781     int str_len, ustr_len;
782     long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
783     long lstart = 0; /* starting position in str in bytes */
784     int32_t start = 0;
785     long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
786     UErrorCode status;
787     unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
788     UBreakIterator* bi = NULL;
789     int ret_pos;
790 felipe 264640 zval *next = NULL; /* return offset of next part of the string */
791 stas 262229
792     if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
793    
794     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
795     "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
796    
797     RETURN_FALSE;
798     }
799    
800     if ( NULL != next ) {
801     if ( !PZVAL_IS_REF(next) ) {
802 stas 306449 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
803 stas 262229 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
804    
805     RETURN_FALSE;
806     }
807     else {
808     /* initialize next */
809     ZVAL_LONG(next, start);
810     }
811     }
812    
813     if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
814    
815     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
816     "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
817    
818     RETURN_FALSE;
819     }
820    
821     if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
822 stas 306449 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
823     RETURN_FALSE;
824     }
825 stas 262229
826 stas 306449 if ( size > INT32_MAX || size < 0) {
827     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
828 stas 262229 RETURN_FALSE;
829     }
830 stas 306449 if (size == 0) {
831     RETURN_EMPTY_STRING();
832     }
833 stas 262229
834     /* we checked that it will fit: */
835     start = (int32_t) lstart;
836    
837     pstr = str + start;
838    
839     /* just in case pstr points in the middle of a character, move forward to the start of the next char */
840     if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
841     unsigned char *str_end = str + str_len;
842    
843     while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
844     pstr++;
845     if ( pstr >= str_end ) {
846     intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
847     "grapheme_extract: invalid input string", 0 TSRMLS_CC );
848    
849     RETURN_FALSE;
850     }
851     }
852     }
853    
854     str_len -= (pstr - str);
855    
856     /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
857     (size + 1 because the size-th character might be the beginning of a grapheme cluster)
858     */
859    
860     if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
861 kirtig 279457 long nsize = ( size < str_len ? size : str_len );
862 stas 262229 if ( NULL != next ) {
863 kirtig 279457 ZVAL_LONG(next, start+nsize);
864 stas 262229 }
865 kirtig 279457 RETURN_STRINGL(((char *)pstr), nsize, 1);
866 stas 262229 }
867    
868     /* convert the strings to UTF-16. */
869     ustr = NULL;
870     ustr_len = 0;
871     status = U_ZERO_ERROR;
872     intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
873    
874     if ( U_FAILURE( status ) ) {
875     /* Set global error code. */
876     intl_error_set_code( NULL, status TSRMLS_CC );
877    
878     /* Set error messages. */
879 stas 292566 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
880 stas 262229
881     if ( NULL != ustr )
882     efree( ustr );
883    
884     RETURN_FALSE;
885     }
886    
887     bi = NULL;
888     status = U_ZERO_ERROR;
889     bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
890    
891     ubrk_setText(bi, ustr, ustr_len, &status);
892    
893     /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
894     can't back up. So, we will not do anything. */
895    
896     /* now we need to find the end of the chunk the user wants us to return */
897    
898     ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
899    
900     efree(ustr);
901     ubrk_close(bi);
902    
903     if ( NULL != next ) {
904     ZVAL_LONG(next, start+ret_pos);
905     }
906    
907     RETURN_STRINGL(((char *)pstr), ret_pos, 1);
908     }
909    
910     /* }}} */
911    
912     /*
913     * Local variables:
914     * tab-width: 4
915     * c-basic-offset: 4
916     * End:
917     * vim600: fdm=marker
918     * vim: noet sw=4 ts=4
919     */
920    

Properties

Name Value
cvs2svn:cvs-rev 1.1.2.4
svn:eol-style native
svn:executable *
svn:keywords Id Rev Revision Date LastChangedDate LastChangedRevision Author LastChangedBy HeadURL URL
svn:mime-type text/x-c