00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef NORMLZR_H
00010 #define NORMLZR_H
00011
00012 #include "unicode/utypes.h"
00013 #include "unicode/unistr.h"
00014 #include "unicode/chariter.h"
00015 #include "unicode/unorm.h"
00016
00017 struct UCharIterator;
00018 typedef struct UCharIterator UCharIterator;
00019
00020 U_NAMESPACE_BEGIN
00111 class U_COMMON_API Normalizer
00112 {
00113 public:
00119 enum {
00120 DONE=0xffff
00121 };
00122
00123
00124
00135 Normalizer(const UnicodeString& str, UNormalizationMode mode);
00136
00148 Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
00149
00160 Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
00161
00166 Normalizer(const Normalizer& copy);
00167
00172 ~Normalizer();
00173
00174
00175
00176
00177
00178
00196 static void normalize(const UnicodeString& source,
00197 UNormalizationMode mode, int32_t options,
00198 UnicodeString& result,
00199 UErrorCode &status);
00200
00222 static void compose(const UnicodeString& source,
00223 UBool compat, int32_t options,
00224 UnicodeString& result,
00225 UErrorCode &status);
00226
00249 static void decompose(const UnicodeString& source,
00250 UBool compat, int32_t options,
00251 UnicodeString& result,
00252 UErrorCode &status);
00253
00272 static UNormalizationCheckResult
00273 quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
00274
00275
00276
00277
00278
00287 UChar32 current(void);
00288
00297 UChar32 first(void);
00298
00307 UChar32 last(void);
00308
00317 UChar32 next(void);
00318
00327 UChar32 previous(void);
00328
00348 UChar32 setIndex(UTextOffset index);
00349
00359 void setIndexOnly(UTextOffset index);
00360
00366 void reset(void);
00367
00382 UTextOffset getIndex(void) const;
00383
00392 UTextOffset startIndex(void) const;
00393
00404 UTextOffset endIndex(void) const;
00405
00414 UBool operator==(const Normalizer& that) const;
00415
00424 inline UBool operator!=(const Normalizer& that) const;
00425
00432 Normalizer* clone(void) const;
00433
00440 int32_t hashCode(void) const;
00441
00442
00443
00444
00445
00461 void setMode(UNormalizationMode newMode);
00462
00473 UNormalizationMode getUMode(void) const;
00474
00491 void setOption(int32_t option,
00492 UBool value);
00493
00504 UBool getOption(int32_t option) const;
00505
00514 void setText(const UnicodeString& newText,
00515 UErrorCode &status);
00516
00525 void setText(const CharacterIterator& newText,
00526 UErrorCode &status);
00527
00537 void setText(const UChar* newText,
00538 int32_t length,
00539 UErrorCode &status);
00546 void getText(UnicodeString& result);
00547
00548
00549
00550
00551
00556 enum {
00557 COMPAT_BIT = 1,
00558 DECOMP_BIT = 2,
00559 COMPOSE_BIT = 4,
00560 FCD_BIT = 8
00561 };
00562
00567 enum EMode {
00581 NO_OP = 0,
00582
00598 COMPOSE = COMPOSE_BIT,
00599
00615 COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT,
00616
00632 DECOMP = DECOMP_BIT,
00633
00649 DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT,
00650
00654 FCD = FCD_BIT
00655 };
00656
00658 enum {
00677 IGNORE_HANGUL = 0x001
00678 };
00679
00690 Normalizer(const UnicodeString& str,
00691 EMode mode);
00692
00711 Normalizer(const UnicodeString& str,
00712 EMode mode,
00713 int32_t opt);
00714
00726 Normalizer(const UChar* str,
00727 int32_t length,
00728 EMode mode);
00729
00745 Normalizer(const UChar* str,
00746 int32_t length,
00747 EMode mode,
00748 int32_t option);
00749
00760 Normalizer(const CharacterIterator& iter,
00761 EMode mode);
00762
00778 Normalizer(const CharacterIterator& iter,
00779 EMode mode,
00780 int32_t opt);
00781
00802 inline static void
00803 normalize(const UnicodeString& source,
00804 EMode mode,
00805 int32_t options,
00806 UnicodeString& result,
00807 UErrorCode &status);
00808
00825 inline static UNormalizationCheckResult
00826 quickCheck(const UnicodeString& source,
00827 EMode mode,
00828 UErrorCode& status);
00829
00837 inline static UNormalizationMode getUNormalizationMode(EMode mode,
00838 UErrorCode& status);
00839
00847 inline static EMode getNormalizerEMode(UNormalizationMode mode,
00848 UErrorCode& status);
00849
00876 inline void setMode(EMode newMode);
00877
00884 inline EMode getMode(void) const;
00885
00886 private:
00887
00888
00889
00890
00891
00892
00893 UBool nextNormalize();
00894 UBool previousNormalize();
00895
00896 void init(CharacterIterator *iter);
00897 void clearBuffer(void);
00898
00899
00900
00901 inline static UNormalizationMode getUMode(EMode mode);
00902
00903
00904
00905
00906
00907 UNormalizationMode fUMode;
00908 int32_t fOptions;
00909
00910
00911 UCharIterator *text;
00912
00913
00914
00915 UTextOffset currentIndex, nextIndex;
00916
00917
00918 UnicodeString buffer;
00919 UTextOffset bufferPos;
00920 };
00921
00922
00923
00924
00925
00926 inline UBool
00927 Normalizer::operator!= (const Normalizer& other) const
00928 { return ! operator==(other); }
00929
00930 inline void
00931 Normalizer::normalize(const UnicodeString& source,
00932 EMode mode, int32_t options,
00933 UnicodeString& result,
00934 UErrorCode &status) {
00935 normalize(source, getUNormalizationMode(mode, status), options, result, status);
00936 }
00937
00938 inline UNormalizationCheckResult
00939 Normalizer::quickCheck(const UnicodeString& source,
00940 EMode mode,
00941 UErrorCode &status) {
00942 return quickCheck(source, getUNormalizationMode(mode, status), status);
00943 }
00944
00945 inline void
00946 Normalizer::setMode(EMode newMode) {
00947 UErrorCode status = U_ZERO_ERROR;
00948 fUMode = getUNormalizationMode(newMode, status);
00949 }
00950
00951 inline Normalizer::EMode
00952 Normalizer::getMode() const {
00953 UErrorCode status = U_ZERO_ERROR;
00954 return getNormalizerEMode(fUMode, status);
00955 }
00956
00957 inline UNormalizationMode Normalizer::getUNormalizationMode(
00958 Normalizer::EMode mode, UErrorCode &status)
00959 {
00960 if (U_SUCCESS(status))
00961 {
00962 switch (mode)
00963 {
00964 case Normalizer::NO_OP :
00965 return UNORM_NONE;
00966 case Normalizer::COMPOSE :
00967 return UNORM_NFC;
00968 case Normalizer::COMPOSE_COMPAT :
00969 return UNORM_NFKC;
00970 case Normalizer::DECOMP :
00971 return UNORM_NFD;
00972 case Normalizer::DECOMP_COMPAT :
00973 return UNORM_NFKD;
00974 case Normalizer::FCD:
00975 return UNORM_FCD;
00976 default :
00977 status = U_ILLEGAL_ARGUMENT_ERROR;
00978 }
00979 }
00980 return UNORM_DEFAULT;
00981 }
00982
00983 inline UNormalizationMode
00984 Normalizer::getUMode(Normalizer::EMode mode) {
00985 switch(mode) {
00986 case Normalizer::NO_OP :
00987 return UNORM_NONE;
00988 case Normalizer::COMPOSE :
00989 return UNORM_NFC;
00990 case Normalizer::COMPOSE_COMPAT :
00991 return UNORM_NFKC;
00992 case Normalizer::DECOMP :
00993 return UNORM_NFD;
00994 case Normalizer::DECOMP_COMPAT :
00995 return UNORM_NFKD;
00996 case Normalizer::FCD:
00997 return UNORM_FCD;
00998 default :
00999 return UNORM_DEFAULT;
01000 }
01001 }
01002
01003 inline Normalizer::EMode Normalizer::getNormalizerEMode(
01004 UNormalizationMode mode, UErrorCode &status)
01005 {
01006 if (U_SUCCESS(status))
01007 {
01008 switch (mode)
01009 {
01010 case UNORM_NONE :
01011 return Normalizer::NO_OP;
01012 case UNORM_NFD :
01013 return Normalizer::DECOMP;
01014 case UNORM_NFKD :
01015 return Normalizer::DECOMP_COMPAT;
01016 case UNORM_NFC :
01017 return Normalizer::COMPOSE;
01018 case UNORM_NFKC :
01019 return Normalizer::COMPOSE_COMPAT;
01020 case UNORM_FCD:
01021 return Normalizer::FCD;
01022 default :
01023 status = U_ILLEGAL_ARGUMENT_ERROR;
01024 }
01025 }
01026 return Normalizer::DECOMP_COMPAT;
01027 }
01028
01029 U_NAMESPACE_END
01030 #endif // NORMLZR_H