00001 00010 /* 00011 * Copyright © 2008 Red Hat, Inc. All rights reserved. 00012 * Copyright © 2008 Ding-Yi Chen <dchen at redhat dot com> 00013 * 00014 * This file is part of the libUnihan Project. 00015 * 00016 * This library is free software; you can redistribute it and/or 00017 * modify it under the terms of the GNU Lesser General Public 00018 * License as published by the Free Software Foundation; either 00019 * version 2 of the License, or (at your option) any later version. 00020 * 00021 * This library is distributed in the hope that it will be useful, 00022 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00023 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00024 * GNU Lesser General Public License for more details. 00025 * 00026 * You should have received a copy of the GNU Lesser General Public 00027 * License along with this program; if not, write to the 00028 * Free Software Foundation, Inc., 59 Temple Place, Suite 330, 00029 * Boston, MA 02111-1307 USA 00030 */ 00031 00032 #ifndef STR_FUNCTIONS_H_ 00033 #define STR_FUNCTIONS_H_ 00034 #include <string.h> 00035 #include <glib.h> 00036 #include <sys/types.h> 00037 #include <regex.h> 00038 00045 #define CHAR_TO_UNSIGNEDINT(c) (unsigned int) ((int) c >=0)? c : c+256 00046 00053 #define CHAR_TO_UNSIGNEDCHAR(c) (unsigned char) ((int) c >=0)? 
c : c+256 00054 00065 typedef struct { 00066 GStringChunk *chunk; 00067 GPtrArray *ptrArray; 00068 GHashTable *hTable; 00069 guint len; 00070 size_t chunk_size_inital; 00071 } StringList; 00072 00078 StringList *stringList_new(); 00079 00090 StringList *stringList_sized_new(size_t chunk_size, size_t element_count); 00091 00097 void stringList_clear(StringList *sList); 00098 00109 int stringList_find_string(StringList *sList,const char* str); 00110 00111 00123 gboolean stringList_has_string(StringList *sList,const gchar* str); 00124 00138 gchar **stringList_to_charPointerPointer(StringList *sList); 00139 00147 const gchar *stringList_index(StringList *sList,guint index); 00148 00168 guint stringList_insert(StringList *sList, const gchar *str); 00169 00195 guint stringList_insert_const(StringList *sList, const gchar *str); 00196 00206 void stringList_free(StringList *sList); 00207 00209 // * @defgroup RegexResult_Match_Flags Regex substring match flags. 00210 // * @{ 00211 // * @name Regex substring match flags. 00212 // * 00213 // * Bitwise regex substring match flags. Use bit operators to combine the flags. 
// *
// * If none of the flags are given, by default, the output will be like:
// * <ol start="0">
// * <li>ab,cd</li>
// * <li>ab</li>
// * <li>cd</li>
// * <li>ef,gh</li>
// * <li>ef</li>
// * <li>gh</li>
// * </ol>
// *
// * If <code>REGEX_RESULT_ALLOW_OVERLAP</code> is given:
// * <ol start="0">
// * <li>ab,cd</li>
// * <li>ab</li>
// * <li>cd</li>
// * <li>cd,ef</li>
// * <li>cd</li>
// * <li>ef</li>
// * <li>ef,gh</li>
// * <li>ef</li>
// * <li>gh</li>
// * </ol>
// *
// * If <code>REGEX_RESULT_EXCLUDE_MAJOR_MATCH</code> is given:
// * <ol start="0">
// * <li>ab</li>
// * <li>cd</li>
// * <li>ef</li>
// * <li>gh</li>
// * </ol>
// *
// * If <code>REGEX_RESULT_EXCLUDE_SUB_MATCH</code> is given:
// * <ol start="0">
// * <li>ab,cd</li>
// * <li>ef,gh</li>
// * </ol>

// * @{
// */

// * Flag to indicate that only the first result is needed.
// *
// * With this flag, results for \c aaa match \c a* will be \c aaa only.
// * Which is the original behavior of regexec()
// *
// * \note This flag overrides REGEX_RESULT_ALLOW_OVERLAP.
// */
//#define REGEX_RESULT_MATCH_ONCE 1
// * Flag to indicate that result substrings can be overlapped.
// *
// * With this flag, results for \c aaa match \c a* will be \c aaa, \c aa, \c a, but not empty string.
// * \note This flag has no effect if REGEX_RESULT_MATCH_ONCE is also set.
// */
//#define REGEX_RESULT_ALLOW_OVERLAP 2
// * Flag to indicate that major matches should be excluded.
// *
// * Major match means the matches of whole regex pattern.
// *
// * With this flag, results for \c abab matches \c a(b) will be \c b and \c b,
// * but not major match \c ab.
// */
//#define REGEX_RESULT_EXCLUDE_MAJOR_MATCH 4
// * Flag to indicate that sub matches should be excluded.
// *
// * Sub match means the matches of the parenthesized sub regex pattern.
// * With this flag, results for \c abab matches \c a(b) will be \c ab and \c ab, but not \c b.
// */
//#define REGEX_RESULT_EXCLUDE_SUB_MATCH 8

// * @}
// * @}
// */

// * @defgroup RegexResult_Match_Functions Regex substring match functions.
// * @{
// * @name Regex substring match functions.
// *
// * These functions return a newly allocated StringList that holds a list of regex-matched substrings.
// * They add substring match functionality to regexec() from \c regex.h.
// *
// * Contrary to intuition, regexec() only matches once even if REG_NOSUB is not set in regcomp().
// * The so-called \i sub-match actually means the sub expressions enclosed by '()' in POSIX extended,
// * or '\(\)' in POSIX basic.
// * For example, matching <code>ab,cd,ef,gh</code> with <code>([a-z]*),([a-z]*)</code> produces the following output:
// * <ol start="0">
// * <li>ab,cd</li>
// * <li>ab</li>
// * <li>cd</li>
// * </ol>
// * But not
// * <ol start="3">
// * <li>ef,gh</li>
// * <li>ef</li>
// * <li>gh</li>
// * </ol>
// * and so on.
// *
// * With regexResult_match_regex_t(), subsequent substrings are reachable.
// * The output can be filtered by using regex substring match flags.
// * @{
// */


// * The data structure that holds the result of a regex match.
// *
// */
//typedef struct{
// StringList *resultList; //!< List of strings that actually match the pattern.
// GArray *startOffsets; //!< Start offset of the actual matched substrings.
//} RegexResult;

// * Create a new RegexResult instance.
// *
// * @return A newly allocated RegexResult instance.
// */
//RegexResult *regexResult_new();

// * Free a RegexResult instance.
// *
// * @param rResult RegexResult to be freed.
// */
//void regexResult_free(RegexResult *rResult);

// * Return regex-matched substrings.
// *
// * This function is a convenience wrapper around regcomp() and
// * regexResult_match_regex_t().
// * It compiles the regex_t from \a pattern using regcomp(),
// * then calls regexResult_match_regex_t() for the matched result.
// *
// * Use regexResult_free() to free the result.
// *
// * If the compilation fails, NULL will be returned.
// *
// * \note REG_NOSUB cannot be used in cflags, because regexec does not
// * fill the data to array of \c regmatch_t.
// *
// * @param pattern Regex pattern.
// * @param str String to be matched.
// * @param cflags flags to be passed to regcomp().
// * @param eflags eflags to be passed to regexec().
// * @param regexResultFlags RegexResult_Match_Flags
// * @return a newly allocated RegexResult,
// * number of matches is indicated by RegexResult->resultList->len. len=0 if no matches;
// * NULL if \c pattern does not pass the compilation.
// *
// * @see RegexResult_Match_Functions
// * @see RegexResult_Match_Flags
// * @see regexResult_match_regex_t()
// */
//RegexResult *regexResult_match(const gchar *pattern,const gchar *str,
// int cflags, int eflags, guint regexResultFlags);

// * Return regex-matched substrings, given an instance of regex_t.
// *
// * This function adds a subsequent-substring handling routine to regexec(), and
// * returns a newly allocated StringList that holds a list of regex-matched substrings in \a str.
00383 // * See RegexResult_Match for further explanation, and RegexResult_Match_Flags for output control. 00384 // * 00385 // * @param preg Regex instance generate by regcomp(). 00386 // * @param str String to be matched. 00387 // * @param eflags eflag to be passed to regexec(). 00388 // * @param regexResultFlags RegexResult_Match_Flags 00389 // * @return a newly allocated RegexResult, 00390 // * number of matches is indicated by RegexResult->resultList->len. len=0 if no matches; 00391 // * 00392 // * @see RegexResult_Match_Functions 00393 // * @see RegexResult_Match_Flags 00394 // * @see regexResult_match() 00395 // */ 00396 //RegexResult *regexResult_match_regex_t( 00397 // regex_t *preg, 00398 // const gchar *str, int eflags, guint regexResultFlags); 00399 00401 // * @} 00402 // * @} 00403 // */ 00404 // 00405 00483 gchar *string_formatted_combine(const gchar *format,StringList *sList,int *counter_ptr); 00484 00523 gchar *string_regex_formatted_combine_regex_t(const gchar *str, const regex_t *preg, const gchar *format, 00524 int eflags, int *counter_ptr); 00525 00548 gchar *string_regex_formatted_combine(const gchar *str, const gchar *pattern, const gchar *format, 00549 int cflags, int eflags, int *counter_ptr); 00550 00551 00575 gchar *string_regex_replace_regex_t(const gchar *str, const regex_t *preg, const gchar *format, 00576 int eflags, int *counter_ptr); 00577 00602 gchar *string_regex_replace(const gchar *str, const gchar *pattern, const gchar *format, 00603 int cflags, int eflags, int *counter_ptr); 00604 00617 gchar* 00618 initString(gchar *str); 00619 00620 00627 gboolean 00628 isEmptyString(const gchar *str); 00629 00638 void string_trim(gchar *str); 00639 00640 00655 gchar* 00656 subString(const gchar *str,int beginIndex, int length); 00657 00675 gchar* 00676 subString_buffer(gchar *buf,const gchar *str,int beginIndex, int length); 00677 00691 gchar* string_append_c(gchar *str, const char ch,size_t length); 00692 00705 gboolean 
string_is_decomposed_fast(const gchar *str); 00706 00717 gchar* string_padding_left(const gchar *str, const gchar *padding_str, size_t length); 00718 00729 gchar* string_padding_right(const gchar *str, const gchar *padding_str, size_t length); 00730 00737 char* ucs4_to_utf8(gunichar ucs4_code); 00738 00745 gunichar* utf8_to_ucs4(const char* utf8_str); 00746 00754 gchar* utf8_concat_ucs4(gchar* utf8_str,gunichar ucs4_code); 00755 00768 int strcmp_unsigned_signed(const unsigned char *str1, const gchar *str2); 00769 00780 unsigned char *signedStr_to_unsignedStr(const gchar *str); 00781 00792 unsigned char *signedStr_to_unsignedStr_buffer(unsigned char *resultBuf, const gchar *str); 00793 00804 char *unsignedStr_to_signedStr(const unsigned char *str); 00805 00816 gchar *unsignedStr_to_signedStr_buffer(gchar* resultBuf, const unsigned char *str); 00817 00818 00819 #endif /*STR_FUNCTIONS_H_*/