initial commit, 4.5 stable
Some checks failed
🔗 GHA / 📊 Static checks (push) Has been cancelled
🔗 GHA / 🤖 Android (push) Has been cancelled
🔗 GHA / 🍏 iOS (push) Has been cancelled
🔗 GHA / 🐧 Linux (push) Has been cancelled
🔗 GHA / 🍎 macOS (push) Has been cancelled
🔗 GHA / 🏁 Windows (push) Has been cancelled
🔗 GHA / 🌐 Web (push) Has been cancelled

This commit is contained in:
2025-09-16 20:46:46 -04:00
commit 9d30169a8d
13378 changed files with 7050105 additions and 0 deletions

View File

@@ -0,0 +1,239 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2011-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: appendable.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec07
* created by: Markus W. Scherer
*/
#ifndef __APPENDABLE_H__
#define __APPENDABLE_H__
/**
* \file
* \brief C++ API: Appendable class: Sink for Unicode code points and 16-bit code units (char16_ts).
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
class UnicodeString;
/**
* Base class for objects to which Unicode characters and strings can be appended.
* Combines elements of Java Appendable and ICU4C ByteSink.
*
* This class can be used in APIs where it does not matter whether the actual destination is
* a UnicodeString, a char16_t[] array, a UnicodeSet, or any other object
* that receives and processes characters and/or strings.
*
* Implementation classes must implement at least appendCodeUnit(char16_t).
* The base class provides default implementations for the other methods.
*
* The methods do not take UErrorCode parameters.
* If an error occurs (e.g., out-of-memory),
* in addition to returning false from failing operations,
* the implementation must prevent unexpected behavior (e.g., crashes)
* from further calls and should make the error condition available separately
* (e.g., store a UErrorCode, make/keep a UnicodeString bogus).
* @stable ICU 4.8
*/
class U_COMMON_API Appendable : public UObject {
public:
/**
* Destructor.
* NOTE(review): not declared virtual in this class; presumably virtual through
* the UObject base class -- confirm against unicode/uobject.h.
* @stable ICU 4.8
*/
~Appendable();
/**
* Appends a 16-bit code unit.
* This is the only pure virtual method of this class, so every concrete
* subclass must implement it.
* @param c code unit
* @return true if the operation succeeded
* @stable ICU 4.8
*/
virtual UBool appendCodeUnit(char16_t c) = 0;
/**
* Appends a code point.
* The default implementation calls appendCodeUnit(char16_t) once or twice.
* @param c code point 0..0x10ffff
* @return true if the operation succeeded
* @stable ICU 4.8
*/
virtual UBool appendCodePoint(UChar32 c);
/**
* Appends a string.
* The default implementation calls appendCodeUnit(char16_t) for each code unit.
* @param s string, must not be nullptr if length!=0
* @param length string length, or -1 if NUL-terminated
* @return true if the operation succeeded
* @stable ICU 4.8
*/
virtual UBool appendString(const char16_t *s, int32_t length);
/**
* Tells the object that the caller is going to append roughly
* appendCapacity char16_ts. A subclass might use this to pre-allocate
* a larger buffer if necessary.
* The default implementation does nothing. (It always returns true.)
* @param appendCapacity estimated number of char16_ts that will be appended
* @return true if the operation succeeded
* @stable ICU 4.8
*/
virtual UBool reserveAppendCapacity(int32_t appendCapacity);
/**
* Returns a writable buffer for appending and writes the buffer's capacity to
* *resultCapacity. Guarantees *resultCapacity>=minCapacity.
* May return a pointer to the caller-owned scratch buffer which must have
* scratchCapacity>=minCapacity.
* The returned buffer is only valid until the next operation
* on this Appendable.
*
* After writing at most *resultCapacity char16_ts, call appendString() with the
* pointer returned from this function and the number of char16_ts written.
* Many appendString() implementations will avoid copying char16_ts if this function
* returned an internal buffer.
*
* Partial usage example:
* \code
* int32_t capacity;
* char16_t* buffer = app.getAppendBuffer(..., &capacity);
* ... Write n char16_ts into buffer, with n <= capacity.
* app.appendString(buffer, n);
* \endcode
* In many implementations, that call to appendString() will avoid copying char16_ts.
*
* If the Appendable allocates or reallocates an internal buffer, it should use
* the desiredCapacityHint if appropriate.
* If a caller cannot provide a reasonable guess at the desired capacity,
* it should pass desiredCapacityHint=0.
*
* If a non-scratch buffer is returned, the caller may only pass
* a prefix of it to appendString().
* That is, it is not correct to pass an interior pointer to appendString().
*
* The default implementation always returns the scratch buffer.
*
* @param minCapacity required minimum capacity of the returned buffer;
* must be non-negative
* @param desiredCapacityHint desired capacity of the returned buffer;
* must be non-negative
* @param scratch default caller-owned buffer
* @param scratchCapacity capacity of the scratch buffer;
* must be at least minCapacity
* @param resultCapacity pointer to an integer which will be set to the
* capacity of the returned buffer
* @return a buffer with *resultCapacity>=minCapacity
* @stable ICU 4.8
*/
virtual char16_t *getAppendBuffer(int32_t minCapacity,
int32_t desiredCapacityHint,
char16_t *scratch, int32_t scratchCapacity,
int32_t *resultCapacity);
};
/**
* An Appendable implementation which writes to a UnicodeString.
*
* This class is not intended for public subclassing.
* @stable ICU 4.8
*/
class U_COMMON_API UnicodeStringAppendable : public Appendable {
public:
/**
* Aliases the UnicodeString (keeps its reference) for writing.
* Only a reference to s is stored, not a copy, so s must remain valid
* for as long as this Appendable is used.
* @param s The UnicodeString to which this Appendable will write.
* @stable ICU 4.8
*/
explicit UnicodeStringAppendable(UnicodeString &s) : str(s) {}
/**
* Destructor.
* @stable ICU 4.8
*/
~UnicodeStringAppendable();
/**
* Appends a 16-bit code unit to the string.
* @param c code unit
* @return true if the operation succeeded
* @stable ICU 4.8
*/
virtual UBool appendCodeUnit(char16_t c) override;
/**
* Appends a code point to the string.
* @param c code point 0..0x10ffff
* @return true if the operation succeeded
* @stable ICU 4.8
*/
virtual UBool appendCodePoint(UChar32 c) override;
/**
* Appends a string to the UnicodeString.
* @param s string, must not be nullptr if length!=0
* @param length string length, or -1 if NUL-terminated
* @return true if the operation succeeded
* @stable ICU 4.8
*/
virtual UBool appendString(const char16_t *s, int32_t length) override;
/**
* Tells the UnicodeString that the caller is going to append roughly
* appendCapacity char16_ts.
* @param appendCapacity estimated number of char16_ts that will be appended
* @return true if the operation succeeded
* @stable ICU 4.8
*/
virtual UBool reserveAppendCapacity(int32_t appendCapacity) override;
/**
* Returns a writable buffer for appending and writes the buffer's capacity to
* *resultCapacity. Guarantees *resultCapacity>=minCapacity.
* May return a pointer to the caller-owned scratch buffer which must have
* scratchCapacity>=minCapacity.
* The returned buffer is only valid until the next write operation
* on the UnicodeString.
*
* For details see Appendable::getAppendBuffer().
*
* @param minCapacity required minimum capacity of the returned buffer;
* must be non-negative
* @param desiredCapacityHint desired capacity of the returned buffer;
* must be non-negative
* @param scratch default caller-owned buffer
* @param scratchCapacity capacity of the scratch buffer;
* must be at least minCapacity
* @param resultCapacity pointer to an integer which will be set to the
* capacity of the returned buffer
* @return a buffer with *resultCapacity>=minCapacity
* @stable ICU 4.8
*/
virtual char16_t *getAppendBuffer(int32_t minCapacity,
int32_t desiredCapacityHint,
char16_t *scratch, int32_t scratchCapacity,
int32_t *resultCapacity) override;
private:
// Destination string, bound by the constructor. Held by reference: this
// class does not own it.
UnicodeString &str;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __APPENDABLE_H__

View File

@@ -0,0 +1,672 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
********************************************************************************
* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
* File brkiter.h
*
* Modification History:
*
* Date Name Description
* 02/18/97 aliu Added typedef for TextCount. Made DONE const.
* 05/07/97 aliu Fixed DLL declaration.
* 07/09/97 jfitz Renamed BreakIterator and interface synced with JDK
* 08/11/98 helena Sync-up JDK1.2.
* 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
********************************************************************************
*/
#ifndef BRKITER_H
#define BRKITER_H
#include "unicode/utypes.h"
/**
* \file
* \brief C++ API: Break Iterator.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#if UCONFIG_NO_BREAK_ITERATION
U_NAMESPACE_BEGIN
/*
* Allow the declaration of APIs with pointers to BreakIterator
* even when break iteration is removed from the build.
*/
class BreakIterator;
U_NAMESPACE_END
#else
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/locid.h"
#include "unicode/ubrk.h"
#include "unicode/strenum.h"
#include "unicode/utext.h"
#include "unicode/umisc.h"
U_NAMESPACE_BEGIN
class CharString;
/**
* The BreakIterator class implements methods for finding the location
* of boundaries in text. BreakIterator is an abstract base class.
* Instances of BreakIterator maintain a current position and scan over
* text returning the index of characters where boundaries occur.
* <p>
* Line boundary analysis determines where a text string can be broken
* when line-wrapping. The mechanism correctly handles punctuation and
* hyphenated words.
* <p>
* Sentence boundary analysis allows selection with correct
* interpretation of periods within numbers and abbreviations, and
* trailing punctuation marks such as quotation marks and parentheses.
* <p>
* Word boundary analysis is used by search and replace functions, as
* well as within text editing applications that allow the user to
* select words with a double click. Word selection provides correct
* interpretation of punctuation marks within and following
* words. Characters that are not part of a word, such as symbols or
* punctuation marks, have word-breaks on both sides.
* <p>
* Character boundary analysis allows users to interact with
* characters as they expect to, for example, when moving the cursor
* through a text string. Character boundary analysis provides correct
* navigation through character strings, regardless of how the
* character is stored. For example, an accented character might be
* stored as a base character and a diacritical mark. What users
* consider to be a character can differ between languages.
* <p>
* The text boundary positions are found according to the rules
* described in Unicode Standard Annex #29, Text Boundaries, and
* Unicode Standard Annex #14, Line Breaking Properties. These
* are available at http://www.unicode.org/reports/tr14/ and
* http://www.unicode.org/reports/tr29/.
* <p>
* In addition to the C++ API defined in this header file, a
* plain C API with equivalent functionality is defined in the
* file ubrk.h
* <p>
* Code snippets illustrating the use of the Break Iterator APIs
* are available in the ICU User Guide,
* https://unicode-org.github.io/icu/userguide/boundaryanalysis/
* and in the sample program icu/source/samples/break/break.cpp
*
*/
class U_COMMON_API BreakIterator : public UObject {
public:
/**
* destructor
* @stable ICU 2.0
*/
virtual ~BreakIterator();
/**
* Return true if another object is semantically equal to this
* one. The other object should be an instance of the same subclass of
* BreakIterator. Objects of different subclasses are considered
* unequal.
* <P>
* Return true if this BreakIterator is at the same position in the
* same text, and is the same class and type (word, line, etc.) of
* BreakIterator, as the argument. Text is considered the same if
* it contains the same characters, it need not be the same
* object, and styles are not considered.
* @stable ICU 2.0
*/
virtual bool operator==(const BreakIterator&) const = 0;
/**
* Returns the complement of the result of operator==
* @param rhs The BreakIterator to be compared for inequality
* @return the complement of the result of operator==
* @stable ICU 2.0
*/
bool operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }
/**
* Return a polymorphic copy of this object. This is an abstract
* method which subclasses implement.
* @stable ICU 2.0
*/
virtual BreakIterator* clone() const = 0;
/**
* Return a polymorphic class ID for this object. Different subclasses
* will return distinct unequal values.
* @stable ICU 2.0
*/
virtual UClassID getDynamicClassID() const override = 0;
/**
* Return a CharacterIterator over the text being analyzed.
* @stable ICU 2.0
*/
virtual CharacterIterator& getText() const = 0;
/**
* Get a UText for the text being analyzed.
* The returned UText is a shallow clone of the UText used internally
* by the break iterator implementation. It can safely be used to
* access the text without impacting any break iterator operations,
* but the underlying text itself must not be altered.
*
* @param fillIn A UText to be filled in. If nullptr, a new UText will be
* allocated to hold the result.
* @param status receives any error codes.
* @return The current UText for this break iterator. If an input
* UText was provided, it will always be returned.
* @stable ICU 3.4
*/
virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
*
* The BreakIterator will retain a reference to the supplied string.
* The caller must not modify or delete the text while the BreakIterator
* retains the reference.
*
* @param text The UnicodeString used to change the text.
* @stable ICU 2.0
*/
virtual void setText(const UnicodeString &text) = 0;
/**
* Reset the break iterator to operate over the text represented by
* the UText. The iterator position is reset to the start.
*
* This function makes a shallow clone of the supplied UText. This means
* that the caller is free to immediately close or otherwise reuse the
* Utext that was passed as a parameter, but that the underlying text itself
* must not be altered while being referenced by the break iterator.
*
* All index positions returned by break iterator functions are
* native indices from the UText. For example, when breaking UTF-8
* encoded text, the break positions returned by next(), previous(), etc.
* will be UTF-8 string indices, not UTF-16 positions.
*
* @param text The UText used to change the text.
* @param status receives any error codes.
* @stable ICU 3.4
*/
virtual void setText(UText *text, UErrorCode &status) = 0;
/**
* Change the text over which this operates. The text boundary is
* reset to the start.
* Note that setText(UText *) provides similar functionality to this function,
* and is more efficient.
* @param it The CharacterIterator used to change the text.
* @stable ICU 2.0
*/
virtual void adoptText(CharacterIterator* it) = 0;
enum {
/**
* DONE is returned by previous() and next() after all valid
* boundaries have been returned.
* @stable ICU 2.0
*/
DONE = static_cast<int32_t>(-1)
};
/**
* Sets the current iteration position to the beginning of the text, position zero.
* @return The offset of the beginning of the text, zero.
* @stable ICU 2.0
*/
virtual int32_t first() = 0;
/**
* Set the iterator position to the index immediately BEYOND the last character in the text being scanned.
* @return The index immediately BEYOND the last character in the text being scanned.
* @stable ICU 2.0
*/
virtual int32_t last() = 0;
/**
* Set the iterator position to the boundary preceding the current boundary.
* @return The character index of the previous text boundary or DONE if all
* boundaries have been returned.
* @stable ICU 2.0
*/
virtual int32_t previous() = 0;
/**
* Advance the iterator to the boundary following the current boundary.
* @return The character index of the next text boundary or DONE if all
* boundaries have been returned.
* @stable ICU 2.0
*/
virtual int32_t next() = 0;
/**
* Return character index of the current iterator position within the text.
* @return The boundary most recently returned.
* @stable ICU 2.0
*/
virtual int32_t current() const = 0;
/**
* Advance the iterator to the first boundary following the specified offset.
* The value returned is always greater than the offset or
* the value BreakIterator.DONE
* @param offset the offset to begin scanning.
* @return The first boundary after the specified offset.
* @stable ICU 2.0
*/
virtual int32_t following(int32_t offset) = 0;
/**
* Set the iterator position to the first boundary preceding the specified offset.
* The value returned is always smaller than the offset or
* the value BreakIterator.DONE
* @param offset the offset to begin scanning.
* @return The first boundary before the specified offset.
* @stable ICU 2.0
*/
virtual int32_t preceding(int32_t offset) = 0;
/**
* Return true if the specified position is a boundary position.
* As a side effect, the current position of the iterator is set
* to the first boundary position at or following the specified offset.
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable ICU 2.0
*/
virtual UBool isBoundary(int32_t offset) = 0;
/**
* Set the iterator position to the nth boundary from the current boundary
* @param n the number of boundaries to move by. A value of 0
* does nothing. Negative values move to previous boundaries
* and positive values move to later boundaries.
* @return The new iterator position, or
* DONE if there are fewer than |n| boundaries in the specified direction.
* @stable ICU 2.0
*/
virtual int32_t next(int32_t n) = 0;
/**
* For RuleBasedBreakIterators, return the status tag from the break rule
* that determined the boundary at the current iteration position.
* <p>
* For break iterator types that do not support a rule status,
* a default value of 0 is returned.
* <p>
* @return the status from the break rule that determined the boundary at
* the current iteration position.
* @see RuleBaseBreakIterator::getRuleStatus()
* @see UWordBreak
* @stable ICU 52
*/
virtual int32_t getRuleStatus() const;
/**
* For RuleBasedBreakIterators, get the status (tag) values from the break rule(s)
* that determined the boundary at the current iteration position.
* <p>
* For break iterator types that do not support rule status,
* no values are returned.
* <p>
* The returned status value(s) are stored into an array provided by the caller.
* The values are stored in sorted (ascending) order.
* If the capacity of the output array is insufficient to hold the data,
* the output will be truncated to the available length, and a
* U_BUFFER_OVERFLOW_ERROR will be signaled.
* <p>
* @see RuleBaseBreakIterator::getRuleStatusVec
*
* @param fillInVec an array to be filled in with the status values.
* @param capacity the length of the supplied vector. A length of zero causes
* the function to return the number of status values, in the
* normal way, without attempting to store any values.
* @param status receives error codes.
* @return The number of rule status values from rules that determined
* the boundary at the current iteration position.
* In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
* is the total number of status values that were available,
* not the reduced number that were actually returned.
* @see getRuleStatus
* @stable ICU 52
*/
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
/**
* Create BreakIterator for word-breaks using the given locale.
* Returns an instance of a BreakIterator implementing word breaks.
* WordBreak is useful for word selection (ex. double click)
* @param where the locale.
* @param status the error code
* @return A BreakIterator for word-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
static BreakIterator* U_EXPORT2
createWordInstance(const Locale& where, UErrorCode& status);
/**
* Create BreakIterator for line-breaks using specified locale.
* Returns an instance of a BreakIterator implementing line breaks. Line
* breaks are logically possible line breaks, actual line breaks are
* usually determined based on display width.
* LineBreak is useful for word wrapping text.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for line-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
static BreakIterator* U_EXPORT2
createLineInstance(const Locale& where, UErrorCode& status);
/**
* Create BreakIterator for character-breaks using specified locale
* Returns an instance of a BreakIterator implementing character breaks.
* Character breaks are boundaries of combining character sequences.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for character-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
static BreakIterator* U_EXPORT2
createCharacterInstance(const Locale& where, UErrorCode& status);
/**
* Create BreakIterator for sentence-breaks using specified locale
* Returns an instance of a BreakIterator implementing sentence breaks.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for sentence-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable ICU 2.0
*/
static BreakIterator* U_EXPORT2
createSentenceInstance(const Locale& where, UErrorCode& status);
#ifndef U_HIDE_DEPRECATED_API
/**
* Create BreakIterator for title-casing breaks using the specified locale
* Returns an instance of a BreakIterator implementing title breaks.
* The iterator returned locates title boundaries as described for
* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
* please use a word boundary iterator. See {@link #createWordInstance }.
*
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for title-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @deprecated ICU 64 Use createWordInstance instead.
*/
static BreakIterator* U_EXPORT2
createTitleInstance(const Locale& where, UErrorCode& status);
#endif /* U_HIDE_DEPRECATED_API */
/**
* Get the set of Locales for which TextBoundaries are installed.
* <p><b>Note:</b> this will not return locales added through the register
* call. To see the registered locales too, use the getAvailableLocales
* function that returns a StringEnumeration object </p>
* @param count the output parameter of number of elements in the locale list
* @return available locales
* @stable ICU 2.0
*/
static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
/**
* Get name of the object for the desired Locale, in the desired language.
* @param objectLocale must be from getAvailableLocales.
* @param displayLocale specifies the desired locale for output.
* @param name the fill-in parameter of the return value
* Uses best match.
* @return user-displayable name
* @stable ICU 2.0
*/
static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
const Locale& displayLocale,
UnicodeString& name);
/**
* Get name of the object for the desired Locale, in the language of the
* default locale.
* @param objectLocale must be from getMatchingLocales
* @param name the fill-in parameter of the return value
* @return user-displayable name
* @stable ICU 2.0
*/
static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
UnicodeString& name);
#ifndef U_FORCE_HIDE_DEPRECATED_API
/**
* Deprecated functionality. Use clone() instead.
*
* Thread safe client-buffer-based cloning operation
* Do NOT call delete on a safeclone, since 'new' is not used to create it.
* @param stackBuffer user allocated space for the new clone. If nullptr new memory will be allocated.
* If buffer is not large enough, new memory will be allocated.
* @param BufferSize reference to size of allocated space.
* If BufferSize == 0, a sufficient size for use in cloning will
* be returned ('pre-flighting')
* If BufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
* necessary.
* @return pointer to the new clone
*
* @deprecated ICU 52. Use clone() instead.
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status) = 0;
#endif // U_FORCE_HIDE_DEPRECATED_API
#ifndef U_HIDE_DEPRECATED_API
/**
* Determine whether the BreakIterator was created in user memory by
* createBufferClone(), and thus should not be deleted. Such objects
* must be closed by an explicit call to the destructor (not delete).
* @deprecated ICU 52. Always delete the BreakIterator.
*/
inline UBool isBufferClone();
#endif /* U_HIDE_DEPRECATED_API */
#if !UCONFIG_NO_SERVICE
/**
* Register a new break iterator of the indicated kind, to use in the given locale.
* The break iterator will be adopted. Clones of the iterator will be returned
* if a request for a break iterator of the given kind matches or falls back to
* this locale.
* Because ICU may choose to cache BreakIterators internally, this must
* be called at application startup, prior to any calls to
* BreakIterator::createXXXInstance to avoid undefined behavior.
* @param toAdopt the BreakIterator instance to be adopted
* @param locale the Locale for which this instance is to be registered
* @param kind the type of iterator for which this instance is to be registered
* @param status the in/out status code, no special meanings are assigned
* @return a registry key that can be used to unregister this instance
* @stable ICU 2.4
*/
static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt,
const Locale& locale,
UBreakIteratorType kind,
UErrorCode& status);
/**
* Unregister a previously-registered BreakIterator using the key returned from the
* register call. Key becomes invalid after a successful call and should not be used again.
* The BreakIterator corresponding to the key will be deleted.
* Because ICU may choose to cache BreakIterators internally, this should
* be called during application shutdown, after all calls to
* BreakIterator::createXXXInstance to avoid undefined behavior.
* @param key the registry key returned by a previous call to registerInstance
* @param status the in/out status code, no special meanings are assigned
* @return true if the iterator for the key was successfully unregistered
* @stable ICU 2.4
*/
static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status);
/**
* Return a StringEnumeration over the locales available at the time of the call,
* including registered locales.
* @return a StringEnumeration over the locales available at the time of the call
* @stable ICU 2.4
*/
static StringEnumeration* U_EXPORT2 getAvailableLocales();
#endif
/**
* Returns the locale for this break iterator. Two flavors are available: valid and
* actual locale.
* @stable ICU 2.8
*/
Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
#ifndef U_HIDE_INTERNAL_API
/** Get the locale for this break iterator object. You can choose between valid and actual locale.
* @param type type of the locale we're looking for (valid or actual)
* @param status error code for the operation
* @return the locale
* @internal
*/
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
#endif /* U_HIDE_INTERNAL_API */
/**
* Set the subject text string upon which the break iterator is operating
* without changing any other aspect of the matching state.
* The new and previous text strings must have the same content.
*
* This function is intended for use in environments where ICU is operating on
* strings that may move around in memory. It provides a mechanism for notifying
* ICU that the string has been relocated, and providing a new UText to access the
* string in its new position.
*
* Note that the break iterator implementation never copies the underlying text
* of a string being processed, but always operates directly on the original text
* provided by the user. Refreshing simply drops the references to the old text
* and replaces them with references to the new.
*
* Caution: this function is normally used only by very specialized,
* system-level code. One example use case is with garbage collection that moves
* the text in memory.
*
* @param input The new (moved) text string.
* @param status Receives errors detected by this function.
* @return *this
*
* @stable ICU 49
*/
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
friend class ICUBreakIteratorFactory;
friend class ICUBreakIteratorService;
protected:
// Do not enclose protected default/copy constructors with #ifndef U_HIDE_INTERNAL_API
// or else the compiler will create public ones.
/** @internal */
BreakIterator();
/** @internal */
BreakIterator (const BreakIterator &other);
#ifndef U_HIDE_INTERNAL_API
/** @internal */
BreakIterator (const Locale& valid, const Locale &actual);
/** @internal. Assignment Operator, used by RuleBasedBreakIterator. */
BreakIterator &operator = (const BreakIterator &other);
#endif /* U_HIDE_INTERNAL_API */
private:
/** @internal (private) */
CharString* actualLocale = nullptr;
CharString* validLocale = nullptr;
CharString* requestLocale = nullptr;
};
#ifndef U_HIDE_DEPRECATED_API
// Deprecated query kept for source compatibility; this inline definition
// unconditionally reports false.
inline UBool BreakIterator::isBufferClone() { return false; }
#endif /* U_HIDE_DEPRECATED_API */
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // BRKITER_H
//eof

View File

@@ -0,0 +1,307 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// Copyright (C) 2009-2012, International Business Machines
// Corporation and others. All Rights Reserved.
//
// Copyright 2007 Google Inc. All Rights Reserved.
// Author: sanjay@google.com (Sanjay Ghemawat)
//
// Abstract interface that consumes a sequence of bytes (ByteSink).
//
// Used so that we can write a single piece of code that can operate
// on a variety of output string types.
//
// Various implementations of this interface are provided:
// ByteSink:
// CheckedArrayByteSink Write to a flat array, with bounds checking
// StringByteSink Write to an STL string
// This code is a contribution of Google code, and the style used here is
// a compromise between the original Google code and the ICU coding guidelines.
// For example, data types are ICU-ified (size_t,int->int32_t),
// and API comments doxygen-ified, but function names and behavior are
// as in the original, if possible.
// Assertion-style error handling, not available in ICU, was changed to
// parameter "pinning" similar to UnicodeString.
//
// In addition, this is only a partial port of the original Google code,
// limited to what was needed so far. The (nearly) complete original code
// is in the ICU svn repository at icuhtml/trunk/design/strings/contrib
// (see ICU ticket 6765, r25517).
#ifndef __BYTESTREAM_H__
#define __BYTESTREAM_H__
/**
* \file
* \brief C++ API: Interface for writing bytes, and implementation classes.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
#include "unicode/std_string.h"
U_NAMESPACE_BEGIN
/**
* Abstract sink that can be filled with bytes.
* Subclasses must implement Append(); GetAppendBuffer() and Flush() are virtual
* and come with default implementations (see their descriptions below).
* A ByteSink cannot be copied or assigned.
* @stable ICU 4.2
*/
class U_COMMON_API ByteSink : public UMemory {
public:
/**
* Default constructor.
* @stable ICU 4.2
*/
ByteSink() { }
/**
* Virtual destructor.
* @stable ICU 4.2
*/
virtual ~ByteSink();
/**
* Append "bytes[0,n-1]" to this.
* This is the only pure virtual function; every concrete sink must provide it.
* @param bytes the pointer to the bytes
* @param n the number of bytes; must be non-negative
* @stable ICU 4.2
*/
virtual void Append(const char* bytes, int32_t n) = 0;
/**
* Appends n bytes to this. Same as Append().
* Call AppendU8() with u8"string literals" which are const char * in C++11
* but const char8_t * in C++20.
* If the compiler does support char8_t as a distinct type,
* then an AppendU8() overload for that is defined and will be chosen.
*
* @param bytes the pointer to the bytes
* @param n the number of bytes; must be non-negative
* @stable ICU 67
*/
inline void AppendU8(const char* bytes, int32_t n) {
// Plain forward to the virtual Append().
Append(bytes, n);
}
#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
/**
* Appends n bytes to this. Same as Append() but for a const char8_t * pointer.
* Call AppendU8() with u8"string literals" which are const char * in C++11
* but const char8_t * in C++20.
* If the compiler does support char8_t as a distinct type,
* then this AppendU8() overload for that is defined and will be chosen.
*
* @param bytes the pointer to the bytes
* @param n the number of bytes; must be non-negative
* @stable ICU 67
*/
inline void AppendU8(const char8_t* bytes, int32_t n) {
// Reinterpret the UTF-8 code units as plain chars and forward to Append().
Append(reinterpret_cast<const char*>(bytes), n);
}
#endif
/**
* Returns a writable buffer for appending and writes the buffer's capacity to
* *result_capacity. Guarantees *result_capacity>=min_capacity.
* May return a pointer to the caller-owned scratch buffer which must have
* scratch_capacity>=min_capacity.
* The returned buffer is only valid until the next operation
* on this ByteSink.
*
* After writing at most *result_capacity bytes, call Append() with the
* pointer returned from this function and the number of bytes written.
* Many Append() implementations will avoid copying bytes if this function
* returned an internal buffer.
*
* Partial usage example:
* int32_t capacity;
* char* buffer = sink->GetAppendBuffer(..., &capacity);
* ... Write n bytes into buffer, with n <= capacity.
* sink->Append(buffer, n);
* In many implementations, that call to Append will avoid copying bytes.
*
* If the ByteSink allocates or reallocates an internal buffer, it should use
* the desired_capacity_hint if appropriate.
* If a caller cannot provide a reasonable guess at the desired capacity,
* it should pass desired_capacity_hint=0.
*
* If a non-scratch buffer is returned, the caller may only pass
* a prefix to it to Append().
* That is, it is not correct to pass an interior pointer to Append().
*
* The default implementation always returns the scratch buffer.
*
* @param min_capacity required minimum capacity of the returned buffer;
* must be non-negative
* @param desired_capacity_hint desired capacity of the returned buffer;
* must be non-negative
* @param scratch default caller-owned buffer
* @param scratch_capacity capacity of the scratch buffer
* @param result_capacity pointer to an integer which will be set to the
* capacity of the returned buffer
* @return a buffer with *result_capacity>=min_capacity
* @stable ICU 4.2
*/
virtual char* GetAppendBuffer(int32_t min_capacity,
int32_t desired_capacity_hint,
char* scratch, int32_t scratch_capacity,
int32_t* result_capacity);
/**
* Flush internal buffers.
* Some byte sinks use internal buffers or provide buffering
* and require calling Flush() at the end of the stream.
* The ByteSink should be ready for further Append() calls after Flush().
* The default implementation of Flush() does nothing.
* @stable ICU 4.2
*/
virtual void Flush();
private:
// Not copyable: copy construction and copy assignment are deleted.
ByteSink(const ByteSink &) = delete;
ByteSink &operator=(const ByteSink &) = delete;
};
// -------------------------------------------------------------
// Some standard implementations
/**
* Implementation of ByteSink that writes to a flat byte array,
* with bounds-checking:
* This sink will not write more than capacity bytes to outbuf.
* If more than capacity bytes are Append()ed, then excess bytes are ignored,
* and Overflowed() will return true.
* Overflow does not cause a runtime error.
* The sink cannot be default-constructed, copied or assigned.
* @stable ICU 4.2
*/
class U_COMMON_API CheckedArrayByteSink : public ByteSink {
public:
/**
* Constructs a ByteSink that will write to outbuf[0..capacity-1].
* @param outbuf buffer to write to
* @param capacity size of the buffer
* @stable ICU 4.2
*/
CheckedArrayByteSink(char* outbuf, int32_t capacity);
/**
* Destructor.
* @stable ICU 4.2
*/
virtual ~CheckedArrayByteSink();
/**
* Returns the sink to its original state, without modifying the buffer.
* Useful for reusing both the buffer and the sink for multiple streams.
* Resets the state to NumberOfBytesWritten()=NumberOfBytesAppended()=0
* and Overflowed()=false.
* @return *this
* @stable ICU 4.6
*/
virtual CheckedArrayByteSink& Reset();
/**
* Append "bytes[0,n-1]" to this.
* @param bytes the pointer to the bytes
* @param n the number of bytes; must be non-negative
* @stable ICU 4.2
*/
virtual void Append(const char* bytes, int32_t n) override;
/**
* Returns a writable buffer for appending and writes the buffer's capacity to
* *result_capacity. For details see the base class documentation.
* @param min_capacity required minimum capacity of the returned buffer;
* must be non-negative
* @param desired_capacity_hint desired capacity of the returned buffer;
* must be non-negative
* @param scratch default caller-owned buffer
* @param scratch_capacity capacity of the scratch buffer
* @param result_capacity pointer to an integer which will be set to the
* capacity of the returned buffer
* @return a buffer with *result_capacity>=min_capacity
* @stable ICU 4.2
*/
virtual char* GetAppendBuffer(int32_t min_capacity,
int32_t desired_capacity_hint,
char* scratch, int32_t scratch_capacity,
int32_t* result_capacity) override;
/**
* Returns the number of bytes actually written to the sink.
* @return number of bytes written to the buffer
* @stable ICU 4.2
*/
int32_t NumberOfBytesWritten() const { return size_; }
/**
* Returns true if any bytes were discarded, i.e., if there was an
* attempt to write more than 'capacity' bytes.
* @return true if more than 'capacity' bytes were Append()ed
* @stable ICU 4.2
*/
UBool Overflowed() const { return overflowed_; }
/**
* Returns the number of bytes appended to the sink.
* If Overflowed() then NumberOfBytesAppended()>NumberOfBytesWritten()
* else they return the same number.
* @return number of bytes appended to the sink, including any bytes that
* were discarded because of overflow
* @stable ICU 4.6
*/
int32_t NumberOfBytesAppended() const { return appended_; }
private:
// Destination array and its fixed capacity; presumably the constructor
// arguments (the constructor is defined out of line -- confirm there).
char* outbuf_;
const int32_t capacity_;
// Value reported by NumberOfBytesWritten().
int32_t size_;
// Value reported by NumberOfBytesAppended().
int32_t appended_;
// Value reported by Overflowed().
UBool overflowed_;
// No default construction, no copying.
CheckedArrayByteSink() = delete;
CheckedArrayByteSink(const CheckedArrayByteSink &) = delete;
CheckedArrayByteSink &operator=(const CheckedArrayByteSink &) = delete;
};
/**
 * Implementation of ByteSink that writes to a "string".
 * The StringClass is usually instantiated with a std::string.
 * It must provide append(const char *, length) and, for the
 * capacity-reserving constructor, capacity(), length() and reserve().
 * The sink only stores the pointer to the string; the string is not copied.
 * @stable ICU 4.2
 */
template<typename StringClass>
class StringByteSink : public ByteSink {
public:
    /**
     * Constructs a ByteSink that will append bytes to the dest string.
     * @param dest pointer to string object to append to
     * @stable ICU 4.2
     */
    StringByteSink(StringClass* dest) : dest_(dest) { }
    /**
     * Constructs a ByteSink that reserves append capacity and will append bytes to the dest string.
     * Nothing is reserved if initialAppendCapacity is not positive or if the
     * string already has at least that much spare capacity.
     *
     * @param dest pointer to string object to append to
     * @param initialAppendCapacity capacity beyond dest->length() to be reserve()d
     * @stable ICU 60
     */
    StringByteSink(StringClass* dest, int32_t initialAppendCapacity) : dest_(dest) {
        if (initialAppendCapacity > 0 &&
                static_cast<uint32_t>(initialAppendCapacity) > dest->capacity() - dest->length()) {
            dest->reserve(dest->length() + initialAppendCapacity);
        }
    }
    /**
     * Append "bytes[0,n-1]" to this.
     * A call with n<=0 appends nothing.
     * @param data the pointer to the bytes
     * @param n the number of bytes; must be non-negative
     * @stable ICU 4.2
     */
    virtual void Append(const char* data, int32_t n) override {
        // Pin the length instead of trusting the caller: a negative n would be
        // converted to a huge unsigned count by the string's append()
        // (size_type for std::string). An empty append is a no-op anyway.
        if (n > 0) {
            dest_->append(data, n);
        }
    }
private:
    // Destination string; not owned by the sink.
    StringClass* dest_;

    // No default construction, no copying.
    StringByteSink() = delete;
    StringByteSink(const StringByteSink &) = delete;
    StringByteSink &operator=(const StringByteSink &) = delete;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __BYTESTREAM_H__

View File

@@ -0,0 +1,568 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytestrie.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*/
#ifndef __BYTESTRIE_H__
#define __BYTESTRIE_H__
/**
* \file
* \brief C++ API: Trie for mapping byte sequences to integer values.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
#include "unicode/ustringtrie.h"
class BytesTrieTest;
U_NAMESPACE_BEGIN
class ByteSink;
class BytesTrieBuilder;
class CharString;
class UVector32;
/**
* Light-weight, non-const reader class for a BytesTrie.
* Traverses a byte-serialized data structure with minimal state,
* for mapping byte sequences to non-negative integer values.
*
* This class owns the serialized trie data only if it was constructed by
* the builder's build() method.
* The public constructor and the copy constructor only alias the data (only copy the pointer).
* There is no assignment operator.
*
* This class is not intended for public subclassing.
* @stable ICU 4.8
*/
class U_COMMON_API BytesTrie : public UMemory {
public:
/**
* Constructs a BytesTrie reader instance.
*
* The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
* starting with the first byte of that sequence.
* The BytesTrie object will not read more bytes than
* the BytesTrieBuilder generated in the corresponding build() call.
*
* The array is not copied/cloned and must not be modified while
* the BytesTrie object is in use.
*
* @param trieBytes The byte array that contains the serialized trie.
* @stable ICU 4.8
*/
BytesTrie(const void *trieBytes)
: ownedArray_(nullptr), bytes_(static_cast<const uint8_t *>(trieBytes)),
pos_(bytes_), remainingMatchLength_(-1) {}
/**
* Destructor.
* @stable ICU 4.8
*/
~BytesTrie();
/**
* Copy constructor, copies the other trie reader object and its state,
* but not the byte array which will be shared. (Shallow copy.)
* The copy never owns the array: its ownedArray_ is always nullptr,
* even if the other object owns its array.
* @param other Another BytesTrie object.
* @stable ICU 4.8
*/
BytesTrie(const BytesTrie &other)
: ownedArray_(nullptr), bytes_(other.bytes_),
pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
/**
* Resets this trie to its initial state.
* @return *this
* @stable ICU 4.8
*/
BytesTrie &reset() {
pos_=bytes_;
remainingMatchLength_=-1;
return *this;
}
/**
* Returns the state of this trie as a 64-bit integer.
* The state value is never 0.
*
* @return opaque state value
* @see resetToState64
* @stable ICU 65
*/
uint64_t getState64() const {
// Pack remainingMatchLength_+2 into the bits from kState64RemainingShift upward
// and the byte offset of pos_ from bytes_ into the bits below.
return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
static_cast<uint64_t>(pos_ - bytes_);
}
/**
* Resets this trie to the saved state.
* Unlike resetToState(State), the 64-bit state value
* must be from getState64() from the same trie object or
* from one initialized the exact same way.
* Because of no validation, this method is faster.
*
* @param state The opaque trie state value from getState64().
* @return *this
* @see getState64
* @see resetToState
* @see reset
* @stable ICU 65
*/
BytesTrie &resetToState64(uint64_t state) {
// Inverse of getState64(): undo the +2 bias and re-base the offset on bytes_.
remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
pos_ = bytes_ + (state & kState64PosMask);
return *this;
}
/**
* BytesTrie state object, for saving a trie's current state
* and resetting the trie back to this state later.
* @stable ICU 4.8
*/
class State : public UMemory {
public:
/**
* Constructs an empty State.
* Only bytes is initialized (to nullptr); pos and remainingMatchLength
* stay unset until saveState() fills them. resetToState() ignores
* a State whose bytes does not match the trie's array.
* @stable ICU 4.8
*/
State() { bytes=nullptr; }
private:
friend class BytesTrie;
const uint8_t *bytes;
const uint8_t *pos;
int32_t remainingMatchLength;
};
/**
* Saves the state of this trie.
* @param state The State object to hold the trie's state.
* @return *this
* @see resetToState
* @stable ICU 4.8
*/
const BytesTrie &saveState(State &state) const {
state.bytes=bytes_;
state.pos=pos_;
state.remainingMatchLength=remainingMatchLength_;
return *this;
}
/**
* Resets this trie to the saved state.
* If the state object contains no state, or the state of a different trie,
* then this trie remains unchanged.
* @param state The State object which holds a saved trie state.
* @return *this
* @see saveState
* @see reset
* @stable ICU 4.8
*/
BytesTrie &resetToState(const State &state) {
// Only accept a state that was saved from a trie reading the same byte array.
if(bytes_==state.bytes && bytes_!=nullptr) {
pos_=state.pos;
remainingMatchLength_=state.remainingMatchLength;
}
return *this;
}
/**
* Determines whether the byte sequence so far matches, whether it has a value,
* and whether another input byte can continue a matching byte sequence.
* @return The match/value Result.
* @stable ICU 4.8
*/
UStringTrieResult current() const;
/**
* Traverses the trie from the initial state for this input byte.
* Equivalent to reset().next(inByte).
* @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
* Values below -0x100 and above 0xff will never match.
* @return The match/value Result.
* @stable ICU 4.8
*/
inline UStringTrieResult first(int32_t inByte) {
remainingMatchLength_=-1;
// Map negative (signed char) input -0x100..-1 to 0..0xff.
if(inByte<0) {
inByte+=0x100;
}
// Start at the root node, i.e. at the beginning of the trie bytes.
return nextImpl(bytes_, inByte);
}
/**
* Traverses the trie from the current state for this input byte.
* @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
* Values below -0x100 and above 0xff will never match.
* @return The match/value Result.
* @stable ICU 4.8
*/
UStringTrieResult next(int32_t inByte);
/**
* Traverses the trie from the current state for this byte sequence.
* Equivalent to
* \code
* Result result=current();
* for(each c in s)
* if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
* result=next(c);
* return result;
* \endcode
* @param s A string or byte sequence. Can be nullptr if length is 0.
* @param length The length of the byte sequence. Can be -1 if NUL-terminated.
* @return The match/value Result.
* @stable ICU 4.8
*/
UStringTrieResult next(const char *s, int32_t length);
/**
* Returns a matching byte sequence's value if called immediately after
* current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
* getValue() can be called multiple times.
*
* Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
* @return The value for the byte sequence so far.
* @stable ICU 4.8
*/
inline int32_t getValue() const {
// Work on a copy of pos_ so that the trie state is not changed.
const uint8_t *pos=pos_;
int32_t leadByte=*pos++;
// U_ASSERT(leadByte>=kMinValueLead);
// readValue() expects the lead byte shifted right by 1, without the is-final bit.
return readValue(pos, leadByte>>1);
}
/**
* Determines whether all byte sequences reachable from the current state
* map to the same value.
* @param uniqueValue Receives the unique value, if this function returns true.
* (output-only)
* @return true if all byte sequences reachable from the current state
* map to the same value.
* @stable ICU 4.8
*/
inline UBool hasUniqueValue(int32_t &uniqueValue) const {
const uint8_t *pos=pos_;
// Skip the rest of a pending linear-match node.
// A nullptr pos means that the trie has stopped matching; then there is no value.
return pos!=nullptr && findUniqueValue(pos+remainingMatchLength_+1, false, uniqueValue);
}
/**
* Finds each byte which continues the byte sequence from the current state.
* That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.
* @param out Each next byte is appended to this object.
* (Only uses the out.Append(s, length) method.)
* @return the number of bytes which continue the byte sequence from here
* @stable ICU 4.8
*/
int32_t getNextBytes(ByteSink &out) const;
/**
* Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
* @stable ICU 4.8
*/
class U_COMMON_API Iterator : public UMemory {
public:
/**
* Iterates from the root of a byte-serialized BytesTrie.
* @param trieBytes The trie bytes.
* @param maxStringLength If 0, the iterator returns full strings/byte sequences.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.8
*/
Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Iterates from the current state of the specified BytesTrie.
* @param trie The trie whose state will be copied for iteration.
* @param maxStringLength If 0, the iterator returns full strings/byte sequences.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.8
*/
Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Destructor.
* @stable ICU 4.8
*/
~Iterator();
/**
* Resets this iterator to its initial state.
* @return *this
* @stable ICU 4.8
*/
Iterator &reset();
/**
* @return true if there are more elements.
* @stable ICU 4.8
*/
UBool hasNext() const;
/**
* Finds the next (byte sequence, value) pair if there is one.
*
* If the byte sequence is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return true if there is another element.
* @stable ICU 4.8
*/
UBool next(UErrorCode &errorCode);
/**
* @return The NUL-terminated byte sequence for the last successful next().
* @stable ICU 4.8
*/
StringPiece getString() const;
/**
* @return The value for the last successful next().
* @stable ICU 4.8
*/
int32_t getValue() const { return value_; }
private:
UBool truncateAndStop();
const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
const uint8_t *bytes_;
const uint8_t *pos_;
const uint8_t *initialPos_;
int32_t remainingMatchLength_;
int32_t initialRemainingMatchLength_;
CharString *str_;
int32_t maxLength_;
int32_t value_;
// The stack stores pairs of integers for backtracking to another
// outbound edge of a branch node.
// The first integer is an offset from bytes_.
// The second integer has the str_->length() from before the node in bits 15..0,
// and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
// (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
// but the code looks more confusing that way.)
UVector32 *stack_;
};
private:
friend class BytesTrieBuilder;
friend class ::BytesTrieTest;
/**
* Constructs a BytesTrie reader instance.
* Unlike the public constructor which just aliases an array,
* this constructor adopts the builder's array.
* This constructor is only called by the builder.
* @param adoptBytes The array to be adopted; stored in ownedArray_.
* @param trieBytes The first byte of the serialized trie; stored in bytes_.
*/
BytesTrie(void *adoptBytes, const void *trieBytes)
: ownedArray_(static_cast<uint8_t *>(adoptBytes)),
bytes_(static_cast<const uint8_t *>(trieBytes)),
pos_(bytes_), remainingMatchLength_(-1) {}
// No assignment operator.
BytesTrie &operator=(const BytesTrie &other) = delete;
// Marks the trie as having no more matches (pos_==nullptr).
inline void stop() {
pos_=nullptr;
}
// Reads a compact 32-bit integer.
// pos is already after the leadByte, and the lead byte is already shifted right by 1.
static int32_t readValue(const uint8_t *pos, int32_t leadByte);
// Skips the bytes of a compact value that follow its lead byte.
// pos is already after the leadByte; unlike for readValue(),
// the leadByte is NOT shifted here, so the thresholds are shifted left by 1 instead.
static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
// U_ASSERT(leadByte>=kMinValueLead);
if(leadByte>=(kMinTwoByteValueLead<<1)) {
if(leadByte<(kMinThreeByteValueLead<<1)) {
// Two-byte value: one byte after the lead byte.
++pos;
} else if(leadByte<(kFourByteValueLead<<1)) {
// Three-byte value: two bytes after the lead byte.
pos+=2;
} else {
// kFourByteValueLead (0x7e) is followed by 3 bytes,
// kFiveByteValueLead (0x7f) by 4 bytes.
pos+=3+((leadByte>>1)&1);
}
}
return pos;
}
// Skips a whole compact value, starting on its lead byte.
static inline const uint8_t *skipValue(const uint8_t *pos) {
int32_t leadByte=*pos++;
return skipValue(pos, leadByte);
}
// Reads a jump delta and jumps.
static const uint8_t *jumpByDelta(const uint8_t *pos);
// Skips a compact jump delta without jumping, starting on its lead byte.
static inline const uint8_t *skipDelta(const uint8_t *pos) {
int32_t delta=*pos++;
if(delta>=kMinTwoByteDeltaLead) {
if(delta<kMinThreeByteDeltaLead) {
// Two-byte delta: one byte after the lead byte.
++pos;
} else if(delta<kFourByteDeltaLead) {
// Three-byte delta: two bytes after the lead byte.
pos+=2;
} else {
// kFourByteDeltaLead (0xfe) is followed by 3 bytes,
// kFiveByteDeltaLead (0xff) by 4 bytes.
pos+=3+(delta&1);
}
}
return pos;
}
// Maps a value node lead byte to a result code:
// USTRINGTRIE_INTERMEDIATE_VALUE, minus 1 if the is-final bit is set.
// NOTE(review): presumably USTRINGTRIE_FINAL_VALUE==USTRINGTRIE_INTERMEDIATE_VALUE-1;
// confirm against the enum in ustringtrie.h.
static inline UStringTrieResult valueResult(int32_t node) {
return static_cast<UStringTrieResult>(USTRINGTRIE_INTERMEDIATE_VALUE - (node & kValueIsFinal));
}
// Handles a branch node for both next(byte) and next(string).
UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
// Requires remainingMatchLength_<0.
UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
// Helper functions for hasUniqueValue().
// Recursively finds a unique value (or whether there is not a unique one)
// from a branch.
static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
UBool haveUniqueValue, int32_t &uniqueValue);
// Recursively finds a unique value (or whether there is not a unique one)
// starting from a position on a node lead byte.
static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
// Helper functions for getNextBytes().
// getNextBytes() when pos is on a branch node.
static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
static void append(ByteSink &out, int c);
// BytesTrie data structure
//
// The trie consists of a series of byte-serialized nodes for incremental
// string/byte sequence matching. The root node is at the beginning of the trie data.
//
// Types of nodes are distinguished by their node lead byte ranges.
// After each node, except a final-value node, another node follows to
// encode match values or continue matching further bytes.
//
// Node types:
// - Value node: Stores a 32-bit integer in a compact, variable-length format.
// The value is for the string/byte sequence so far.
// One node bit indicates whether the value is final or whether
// matching continues with the next node.
// - Linear-match node: Matches a number of bytes.
// - Branch node: Branches to other nodes according to the current input byte.
// The node byte is the length of the branch (number of bytes to select from)
// minus 1. It is followed by a sub-node:
// - If the length is at most kMaxBranchLinearSubNodeLength, then
// there are length-1 (key, value) pairs and then one more comparison byte.
// If one of the key bytes matches, then the value is either a final value for
// the string/byte sequence so far, or a "jump" delta to the next node.
// If the last byte matches, then matching continues with the next node.
// (Values have the same encoding as value nodes.)
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
// there is one byte and one "jump" delta.
// If the input byte is less than the sub-node byte, then "jump" by delta to
// the next sub-node which will have a length of length/2.
// (The delta has its own compact encoding.)
// Otherwise, skip the "jump" delta to the next sub-node
// which will have a length of length-length/2.
// Node lead byte values.
// 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
// the length is one more than the next byte.
// For a branch sub-node with at most this many entries, we drop down
// to a linear search.
static const int32_t kMaxBranchLinearSubNodeLength=5;
// 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
static const int32_t kMinLinearMatch=0x10;
static const int32_t kMaxLinearMatchLength=0x10;
// 20..ff: Variable-length value node.
// If odd, the value is final. (Otherwise, intermediate value or jump delta.)
// Then shift-right by 1 bit.
// The remaining lead byte value indicates the number of following bytes (0..4)
// and contains the value's top bits.
static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20
// It is a final value if bit 0 is set.
static const int32_t kValueIsFinal=1;
// Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10
static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte.
static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51
static const int32_t kMaxTwoByteValue=0x1aff;
static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c
static const int32_t kFourByteValueLead=0x7e;
// A little more than Unicode code points. (0x11ffff)
static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
static const int32_t kFiveByteValueLead=0x7f;
// Compact delta integers.
static const int32_t kMaxOneByteDelta=0xbf;
static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0
static const int32_t kMinThreeByteDeltaLead=0xf0;
static const int32_t kFourByteDeltaLead=0xfe;
static const int32_t kFiveByteDeltaLead=0xff;
static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
// For getState64():
// The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
// so we need at least 5 bits for that.
// We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
// With a shift of 59, the top 5 bits of the state hold that value
// and the low 59 bits (kState64PosMask) hold the offset of pos_ from bytes_.
static constexpr int32_t kState64RemainingShift = 59;
static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
// The adopted array if this object was created by the builder, otherwise nullptr.
uint8_t *ownedArray_;
// Fixed value referencing the BytesTrie bytes.
const uint8_t *bytes_;
// Iterator variables.
// Pointer to next trie byte to read. nullptr if no more matches.
const uint8_t *pos_;
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
int32_t remainingMatchLength_;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __BYTESTRIE_H__

View File

@@ -0,0 +1,193 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytestriebuilder.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C++ API: Builder for icu::BytesTrie
*/
#ifndef __BYTESTRIEBUILDER_H__
#define __BYTESTRIEBUILDER_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/bytestrie.h"
#include "unicode/stringpiece.h"
#include "unicode/stringtriebuilder.h"
class BytesTrieTest;
U_NAMESPACE_BEGIN
class BytesTrieElement;
class CharString;
/**
* Builder class for BytesTrie.
*
* This class is not intended for public subclassing.
* @stable ICU 4.8
*/
class U_COMMON_API BytesTrieBuilder : public StringTrieBuilder {
public:
/**
* Constructs an empty builder.
* @param errorCode Standard ICU error code.
* @stable ICU 4.8
*/
BytesTrieBuilder(UErrorCode &errorCode);
/**
* Destructor.
* @stable ICU 4.8
*/
virtual ~BytesTrieBuilder();
/**
* Adds a (byte sequence, value) pair.
* The byte sequence must be unique.
* The bytes will be copied; the builder does not keep
* a reference to the input StringPiece or its data().
* @param s The input byte sequence.
* @param value The value associated with this byte sequence.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return *this
* @stable ICU 4.8
*/
BytesTrieBuilder &add(StringPiece s, int32_t value, UErrorCode &errorCode);
/**
* Builds a BytesTrie for the add()ed data.
* Once built, no further data can be add()ed until clear() is called.
*
* A BytesTrie cannot be empty. At least one (byte sequence, value) pair
* must have been add()ed.
*
* This method passes ownership of the builder's internal result array to the new trie object.
* Another call to any build() variant will re-serialize the trie.
* After clear() has been called, a new array will be used as well.
* @param buildOption Build option, see UStringTrieBuildOption.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return A new BytesTrie for the add()ed data.
* @stable ICU 4.8
*/
BytesTrie *build(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
/**
* Builds a BytesTrie for the add()ed data and byte-serializes it.
* Once built, no further data can be add()ed until clear() is called.
*
* A BytesTrie cannot be empty. At least one (byte sequence, value) pair
* must have been add()ed.
*
* Multiple calls to buildStringPiece() return StringPieces referring to the
* builder's same byte array, without rebuilding.
* If buildStringPiece() is called after build(), the trie will be
* re-serialized into a new array (because build() passes on ownership).
* If build() is called after buildStringPiece(), the trie object returned
* by build() will become the owner of the underlying string for the
* previously returned StringPiece.
* After clear() has been called, a new array will be used as well.
* @param buildOption Build option, see UStringTrieBuildOption.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return A StringPiece which refers to the byte-serialized BytesTrie for the add()ed data.
* @stable ICU 4.8
*/
StringPiece buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
/**
* Removes all (byte sequence, value) pairs.
* New data can then be add()ed and a new trie can be built.
* @return *this
* @stable ICU 4.8
*/
BytesTrieBuilder &clear();
private:
// Grants the test class (declared in the global namespace) access to the private members.
friend class ::BytesTrieTest;
// Builders are not copyable: both copy operations are deleted.
BytesTrieBuilder(const BytesTrieBuilder &other) = delete; // no copy constructor
BytesTrieBuilder &operator=(const BytesTrieBuilder &other) = delete; // no assignment operator
// Internal build step; takes the same arguments as build() and buildStringPiece().
// NOTE(review): presumably the shared serialization routine behind both public
// build variants -- confirm against the implementation file.
void buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
// Overrides of the StringTrieBuilder hooks for inspecting the add()ed elements.
// Index parameters are named byteIndex because the added sequences are byte sequences,
// even though the base-class signatures carry a unit as char16_t.
virtual int32_t getElementStringLength(int32_t i) const override;
virtual char16_t getElementUnit(int32_t i, int32_t byteIndex) const override;
virtual int32_t getElementValue(int32_t i) const override;
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const override;
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const override;
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const override;
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, char16_t byte) const override;
// Format parameters reported to the base class.
// This builder always answers false for match nodes having values;
// the three limits below are forwarded from the corresponding BytesTrie constants.
virtual UBool matchNodesCanHaveValues() const override { return false; }
virtual int32_t getMaxBranchLinearSubNodeLength() const override { return BytesTrie::kMaxBranchLinearSubNodeLength; }
virtual int32_t getMinLinearMatch() const override { return BytesTrie::kMinLinearMatch; }
virtual int32_t getMaxLinearMatchLength() const override { return BytesTrie::kMaxLinearMatchLength; }
/**
* LinearMatchNode specialization whose units are referenced through a const char pointer.
* Overrides equality comparison and write() of the base node type.
* @internal (private)
*/
class BTLinearMatchNode : public LinearMatchNode {
public:
// units/len describe the char units of the node; nextNode is passed along with them.
// NOTE(review): whether the units are copied or only referenced is not visible here.
BTLinearMatchNode(const char *units, int32_t len, Node *nextNode);
virtual bool operator==(const Node &other) const override;
virtual void write(StringTrieBuilder &builder) override;
private:
// Pointer to the node's char units (not declared as an owning type).
const char *s;
};
// Factory override used by the base class to obtain a linear-match node
// for element i starting at byteIndex with the given length.
virtual Node *createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
Node *nextNode) const override;
// NOTE(review): presumably makes sure the bytes buffer can hold `length` bytes and
// reports success as a UBool -- confirm against the implementation file.
UBool ensureCapacity(int32_t length);
// Serialization primitives: the overrides implement the StringTrieBuilder write hooks;
// write(const char *, int32_t) is a builder-local overload for a run of chars.
virtual int32_t write(int32_t byte) override;
int32_t write(const char *b, int32_t length);
virtual int32_t writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) override;
virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) override;
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) override;
virtual int32_t writeDeltaTo(int32_t jumpTarget) override;
// Static helper that encodes the delta i into the caller-provided intBytes array.
// NOTE(review): the return value is presumably the number of bytes written -- confirm.
static int32_t internalEncodeDelta(int32_t i, char intBytes[]);
CharString *strings; // Pointer not object so we need not #include internal charstr.h.
// Array of the add()ed elements.
// NOTE(review): elementsCapacity / elementsLength presumably track allocated vs. used slots.
BytesTrieElement *elements;
int32_t elementsCapacity;
int32_t elementsLength;
// Byte serialization of the trie.
// Grows from the back: bytesLength measures from the end of the buffer!
char *bytes;
int32_t bytesCapacity;
int32_t bytesLength;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __BYTESTRIEBUILDER_H__

View File

@@ -0,0 +1,215 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
#ifndef CANITER_H
#define CANITER_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uobject.h"
#include "unicode/unistr.h"
/**
* \file
* \brief C++ API: Canonical Iterator
*/
/** Should permutation skip characters with combining class zero
* Should be either true or false. This is a compile time option
* @stable ICU 2.4
*/
#ifndef CANITER_SKIP_ZEROES
#define CANITER_SKIP_ZEROES true
#endif
U_NAMESPACE_BEGIN
class Hashtable;
class Normalizer2;
class Normalizer2Impl;
/**
* This class allows one to iterate through all the strings that are canonically equivalent to a given
* string. For example, here are some sample results:
Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
1: \\u0041\\u030A\\u0064\\u0307\\u0327
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
2: \\u0041\\u030A\\u0064\\u0327\\u0307
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
3: \\u0041\\u030A\\u1E0B\\u0327
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
4: \\u0041\\u030A\\u1E11\\u0307
= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
5: \\u00C5\\u0064\\u0307\\u0327
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
6: \\u00C5\\u0064\\u0327\\u0307
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
7: \\u00C5\\u1E0B\\u0327
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
8: \\u00C5\\u1E11\\u0307
= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
9: \\u212B\\u0064\\u0307\\u0327
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
10: \\u212B\\u0064\\u0327\\u0307
= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
11: \\u212B\\u1E0B\\u0327
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
12: \\u212B\\u1E11\\u0307
= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
* since it has not been optimized for that situation.
* Note, CanonicalIterator is not intended to be subclassed.
* @author M. Davis
* @author C++ port by V. Weinstein
* @stable ICU 2.4
*/
class U_COMMON_API CanonicalIterator final : public UObject {
public:
/**
* Construct a CanonicalIterator object
* @param source string to get results for
* @param status Fill-in parameter which receives the status of this operation.
* @stable ICU 2.4
*/
CanonicalIterator(const UnicodeString &source, UErrorCode &status);
/** Destructor
* Cleans pieces
* @stable ICU 2.4
*/
virtual ~CanonicalIterator();
/**
* Gets the NFD form of the current source we are iterating over.
* @return the source, returned by value. NOTE: it is the NFD form of the source.
* @stable ICU 2.4
*/
UnicodeString getSource();
/**
* Resets the iterator so that one can start again from the beginning.
* @stable ICU 2.4
*/
void reset();
/**
* Get the next canonically equivalent string.
* <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
* @return the next string that is canonically equivalent. A bogus string is returned when
* the iteration is done.
* @stable ICU 2.4
*/
UnicodeString next();
/**
* Set a new source for this iterator. Allows object reuse.
* @param newSource the source string to iterate against. This allows the same iterator to be used
* while changing the source string, saving object creation.
* @param status Fill-in parameter which receives the status of this operation.
* @stable ICU 2.4
*/
void setSource(const UnicodeString &newSource, UErrorCode &status);
#ifndef U_HIDE_INTERNAL_API
/**
* Dumb recursive implementation of permutation.
* TODO: optimize
* @param source the string to find permutations for
* @param skipZeros determine if skip zeros
* @param result the results in a set.
* @param status Fill-in parameter which receives the status of this operation.
* @param depth depth of the call; defaults to 0 for the outermost call.
* @internal
*/
static void U_EXPORT2 permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status, int32_t depth=0);
#endif /* U_HIDE_INTERNAL_API */
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @stable ICU 2.2
*/
static UClassID U_EXPORT2 getStaticClassID();
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @stable ICU 2.2
*/
virtual UClassID getDynamicClassID() const override;
private:
// ===================== PRIVATES ==============================
// Default constructor is deleted: an iterator must be constructed from a source string.
CanonicalIterator() = delete;
/**
* Copy constructor. Deleted: the iterator is not copyable.
* @internal (private)
*/
CanonicalIterator(const CanonicalIterator& other) = delete;
/**
* Assignment operator. Deleted: the iterator is not assignable.
* @internal (private)
*/
CanonicalIterator& operator=(const CanonicalIterator& other) = delete;
// fields
// The source string; per getSource() this is the NFD form of what the caller supplied.
UnicodeString source;
// NOTE(review): presumably set once the iteration is exhausted -- confirm in the implementation.
UBool done;
// 2 dimensional array holds the pieces of the string with
// their different canonically equivalent representations
UnicodeString **pieces;
int32_t pieces_length;
int32_t *pieces_lengths;
// current is used in iterating to combine pieces
int32_t *current;
int32_t current_length;
// transient fields
UnicodeString buffer;
// Normalization helpers used by the iterator.
// NOTE(review): ownership/lifetime of these pointers is not visible in this header.
const Normalizer2 *nfd;
const Normalizer2Impl *nfcImpl;
// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
// The number of returned strings is reported through result_len.
// (The trailing and following "//"-style signatures look like the Java-style
// originals of these helpers and are kept for reference only.)
UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status); //private String[] getEquivalents(String segment)
//Set getEquivalents2(String segment);
Hashtable *getEquivalents2(Hashtable *fillinResult, const char16_t *segment, int32_t segLen, UErrorCode &status);
//Hashtable *getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status);
/**
* See if the decomposition of the code point comp (called cp2 in earlier
* versions of this comment) is at segment starting at segmentPos
* (with canonical rearrangement!)
* If so, take the remainder, and return the equivalents
*/
//Set extract(int comp, String segment, int segmentPos, StringBuffer buffer);
Hashtable *extract(Hashtable *fillinResult, UChar32 comp, const char16_t *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
//Hashtable *extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
// Releases the pieces data; the destructor documentation ("Cleans pieces") refers to this.
void cleanPieces();
};
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,497 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// casemap.h
// created: 2017jan12 Markus W. Scherer
#ifndef __CASEMAP_H__
#define __CASEMAP_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: Low-level C++ case mapping functions.
*/
U_NAMESPACE_BEGIN
class BreakIterator;
class ByteSink;
class Edits;
/**
* Low-level C++ case mapping functions.
*
* @stable ICU 59
*/
class U_COMMON_API CaseMap final : public UMemory {
public:
/**
* Lowercases a UTF-16 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
* dest may be nullptr and the function will only return the length of the result
* without writing any of the result string.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
* When the result would be longer than destCapacity,
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
*
* @see u_strToLower
* @stable ICU 59
*/
static int32_t toLower(
const char *locale, uint32_t options,
const char16_t *src, int32_t srcLength,
char16_t *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
/**
* Uppercases a UTF-16 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
* dest may be nullptr and the function will only return the length of the result
* without writing any of the result string.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
* When the result would be longer than destCapacity,
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
*
* @see u_strToUpper
* @stable ICU 59
*/
static int32_t toUpper(
const char *locale, uint32_t options,
const char16_t *src, int32_t srcLength,
char16_t *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
/**
* Titlecases a UTF-16 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with options bits.)
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param iter A break iterator to find the first characters of words that are to be titlecased.
* It is set to the source string (setText())
* and used one or more times for iteration (first() and next()).
* If nullptr, then a word break iterator for the locale is used
* (or something equivalent).
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
* dest may be nullptr and the function will only return the length of the result
* without writing any of the result string.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
* When the result would be longer than destCapacity,
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
*
* @see u_strToTitle
* @see ucasemap_toTitle
* @stable ICU 59
*/
static int32_t toTitle(
const char *locale, uint32_t options, BreakIterator *iter,
const char16_t *src, int32_t srcLength,
char16_t *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
#endif // UCONFIG_NO_BREAK_ITERATION
/**
* Case-folds a UTF-16 string and optionally records edits.
*
* Case folding is locale-independent and not context-sensitive,
* but there is an option for whether to include or exclude mappings for dotted I
* and dotless i that are marked with 'T' in CaseFolding.txt.
*
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
* dest may be nullptr and the function will only return the length of the result
* without writing any of the result string.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
* When the result would be longer than destCapacity,
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
*
* @see u_strFoldCase
* @stable ICU 59
*/
static int32_t fold(
uint32_t options,
const char16_t *src, int32_t srcLength,
char16_t *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
/**
* Lowercases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param sink A ByteSink to which the result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
*
* @see ucasemap_utf8ToLower
* @stable ICU 60
*/
static void utf8ToLower(
const char *locale, uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode);
/**
* Uppercases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param sink A ByteSink to which the result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
*
* @see ucasemap_utf8ToUpper
* @stable ICU 60
*/
static void utf8ToUpper(
const char *locale, uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
/**
* Titlecases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
*
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with options bits.)
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param iter A break iterator to find the first characters of words that are to be titlecased.
* It is set to the source string (setUText())
* and used one or more times for iteration (first() and next()).
* If nullptr, then a word break iterator for the locale is used
* (or something equivalent).
* @param src The original string.
* @param sink A ByteSink to which the result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
*
* @see ucasemap_utf8ToTitle
* @stable ICU 60
*/
static void utf8ToTitle(
const char *locale, uint32_t options, BreakIterator *iter,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode);
#endif // UCONFIG_NO_BREAK_ITERATION
/**
* Case-folds a UTF-8 string and optionally records edits.
*
* Case folding is locale-independent and not context-sensitive,
* but there is an option for whether to include or exclude mappings for dotted I
* and dotless i that are marked with 'T' in CaseFolding.txt.
*
* The result may be longer or shorter than the original.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
* @param src The original string.
* @param sink A ByteSink to which the result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
*
* @see ucasemap_utf8FoldCase
* @stable ICU 60
*/
static void utf8Fold(
uint32_t options,
StringPiece src, ByteSink &sink, Edits *edits,
UErrorCode &errorCode);
/**
* Lowercases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be nullptr and the function will only return the length of the result
* without writing any of the result string.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
* When the result would be longer than destCapacity,
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
*
* @see ucasemap_utf8ToLower
* @stable ICU 59
*/
static int32_t utf8ToLower(
const char *locale, uint32_t options,
const char *src, int32_t srcLength,
char *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
/**
* Uppercases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be nullptr and the function will only return the length of the result
* without writing any of the result string.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
* When the result would be longer than destCapacity,
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
*
* @see ucasemap_utf8ToUpper
* @stable ICU 59
*/
static int32_t utf8ToUpper(
const char *locale, uint32_t options,
const char *src, int32_t srcLength,
char *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
/**
* Titlecases a UTF-8 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with options bits.)
*
* @param locale The locale ID. ("" = root locale, nullptr = default locale.)
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_TITLECASE_NO_LOWERCASE,
* U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
* U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
* @param iter A break iterator to find the first characters of words that are to be titlecased.
* It is set to the source string (setUText())
* and used one or more times for iteration (first() and next()).
* If nullptr, then a word break iterator for the locale is used
* (or something equivalent).
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be nullptr and the function will only return the length of the result
* without writing any of the result string.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents is undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
* When the result would be longer than destCapacity,
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
*
* @see ucasemap_utf8ToTitle
* @stable ICU 59
*/
static int32_t utf8ToTitle(
const char *locale, uint32_t options, BreakIterator *iter,
const char *src, int32_t srcLength,
char *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
#endif // UCONFIG_NO_BREAK_ITERATION
/**
* Case-folds a UTF-8 string and optionally records edits.
*
* Case folding is locale-independent and not context-sensitive,
* but there is an option for whether to include or exclude mappings for dotted I
* and dotless i that are marked with 'T' in CaseFolding.txt.
*
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
* U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents are undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be nullptr and the function will only return the length of the result
* without writing any of the result string.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents are undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Reference to an in/out error code value
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful.
* When the result would be longer than destCapacity,
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
*
* @see ucasemap_utf8FoldCase
* @stable ICU 59
*/
static int32_t utf8Fold(
uint32_t options,
const char *src, int32_t srcLength,
char *dest, int32_t destCapacity, Edits *edits,
UErrorCode &errorCode);
private:
// CaseMap cannot be instantiated or copied: the default constructor,
// the copy constructor, and the copy assignment operator are all deleted.
CaseMap() = delete;
CaseMap(const CaseMap &other) = delete;
CaseMap &operator=(const CaseMap &other) = delete;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __CASEMAP_H__

View File

@@ -0,0 +1,453 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// char16ptr.h
// created: 2017feb28 Markus W. Scherer
#ifndef __CHAR16PTR_H__
#define __CHAR16PTR_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
#include <cstddef>
#include <string_view>
#include <type_traits>
#endif
/**
* \file
* \brief C++ API: char16_t pointer wrappers with
* implicit conversion from bit-compatible raw pointer types.
* Also conversion functions from char16_t * to UChar * and OldUChar *.
*/
/**
* \def U_ALIASING_BARRIER
* Barrier for pointer anti-aliasing optimizations even across function boundaries.
*
* The definition is selected as follows:
* - If the macro is already defined, that definition is kept.
* - With clang or GCC, except on the Native Client platform, it expands to an
*   empty inline-assembly statement that uses ptr as an input and clobbers memory.
* - When Doxygen is running, it expands to nothing.
* - Otherwise the macro remains undefined, and code in this header that checks
*   whether it is defined uses its non-barrier fallback.
* @internal
*/
#ifdef U_ALIASING_BARRIER
// Use the predefined value.
#elif (defined(__clang__) || defined(__GNUC__)) && U_PLATFORM != U_PF_BROWSER_NATIVE_CLIENT
// No instructions are emitted: ptr is an input operand ("rm" = register or memory)
// and the "memory" clobber tells the compiler that memory may have been modified.
# define U_ALIASING_BARRIER(ptr) asm volatile("" : : "rm"(ptr) : "memory")
#elif defined(U_IN_DOXYGEN)
// Empty expansion; presumably only here so that the macro shows up in the generated docs.
# define U_ALIASING_BARRIER(ptr)
#endif
// ICU DLL-exported
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* char16_t * wrapper with implicit conversion from distinct but bit-compatible pointer types.
* The converting constructors are not explicit, so a char16_t *, a uint16_t *
* (unless U_CHAR16_IS_TYPEDEF), a wchar_t * (if U_SIZEOF_WCHAR_T==2), or nullptr
* can be passed wherever a Char16Ptr is expected.
* @stable ICU 59
*/
class U_COMMON_API Char16Ptr final {
public:
/**
* Copies the pointer.
* @param p pointer
* @stable ICU 59
*/
inline Char16Ptr(char16_t *p);
#if !U_CHAR16_IS_TYPEDEF
/**
* Converts the pointer to char16_t *.
* (Only declared if U_CHAR16_IS_TYPEDEF is false.)
* @param p pointer to be converted
* @stable ICU 59
*/
inline Char16Ptr(uint16_t *p);
#endif
#if U_SIZEOF_WCHAR_T==2 || defined(U_IN_DOXYGEN)
/**
* Converts the pointer to char16_t *.
* (Only defined if U_SIZEOF_WCHAR_T==2.)
* @param p pointer to be converted
* @stable ICU 59
*/
inline Char16Ptr(wchar_t *p);
#endif
/**
* nullptr constructor.
* @param p nullptr
* @stable ICU 59
*/
inline Char16Ptr(std::nullptr_t p);
/**
* Destructor.
* When U_ALIASING_BARRIER is defined, applies the barrier to the wrapped pointer;
* otherwise does nothing.
* @stable ICU 59
*/
inline ~Char16Ptr();
/**
* Pointer access.
* @return the wrapped pointer
* @stable ICU 59
*/
inline char16_t *get() const;
/**
* char16_t pointer access via type conversion (e.g., static_cast).
* Forwards to get().
* @return the wrapped pointer
* @stable ICU 59
*/
inline operator char16_t *() const { return get(); }
private:
// A Char16Ptr is always constructed from a pointer value (which may be nullptr).
Char16Ptr() = delete;
#ifdef U_ALIASING_BARRIER
// Barrier variant: apply the aliasing barrier to the incoming pointer,
// then reinterpret it as char16_t *.
template<typename T> static char16_t *cast(T *t) {
U_ALIASING_BARRIER(t);
return reinterpret_cast<char16_t *>(t);
}
// The wrapped pointer.
char16_t *p_;
#else
// Fallback variant without a barrier macro: the constructors store the pointer
// in the union member of the matching type, and get() reads it back through cp.
union {
char16_t *cp;
uint16_t *up;
wchar_t *wp;
} u_;
#endif
};
/// \cond
#ifdef U_ALIASING_BARRIER
// Barrier-based implementation: the wrapped pointer is held in p_.
// A char16_t * (or nullptr) is stored as is.
Char16Ptr::Char16Ptr(char16_t *p) : p_(p) {}
#if !U_CHAR16_IS_TYPEDEF
// Other pointer types go through cast(), which applies the barrier before reinterpreting.
Char16Ptr::Char16Ptr(uint16_t *p) : p_(cast(p)) {}
#endif
#if U_SIZEOF_WCHAR_T==2
Char16Ptr::Char16Ptr(wchar_t *p) : p_(cast(p)) {}
#endif
Char16Ptr::Char16Ptr(std::nullptr_t p) : p_(p) {}
// The destructor applies the barrier to the wrapped pointer once more.
Char16Ptr::~Char16Ptr() {
U_ALIASING_BARRIER(p_);
}
char16_t *Char16Ptr::get() const { return p_; }
#else
// Union-based fallback: each constructor writes the union member whose type
// matches its argument; get() always reads the cp member.
Char16Ptr::Char16Ptr(char16_t *p) { u_.cp = p; }
#if !U_CHAR16_IS_TYPEDEF
Char16Ptr::Char16Ptr(uint16_t *p) { u_.up = p; }
#endif
#if U_SIZEOF_WCHAR_T==2
Char16Ptr::Char16Ptr(wchar_t *p) { u_.wp = p; }
#endif
Char16Ptr::Char16Ptr(std::nullptr_t p) { u_.cp = p; }
Char16Ptr::~Char16Ptr() {}
char16_t *Char16Ptr::get() const { return u_.cp; }
#endif
/// \endcond
/**
* const char16_t * wrapper with implicit conversion from distinct but bit-compatible pointer types.
* The converting constructors are not explicit, so a const char16_t *, a const uint16_t *
* (unless U_CHAR16_IS_TYPEDEF), a const wchar_t * (if U_SIZEOF_WCHAR_T==2), or nullptr
* can be passed wherever a ConstChar16Ptr is expected.
* @stable ICU 59
*/
class U_COMMON_API ConstChar16Ptr final {
public:
/**
* Copies the pointer.
* @param p pointer
* @stable ICU 59
*/
inline ConstChar16Ptr(const char16_t *p);
#if !U_CHAR16_IS_TYPEDEF
/**
* Converts the pointer to const char16_t *.
* (Only declared if U_CHAR16_IS_TYPEDEF is false.)
* @param p pointer to be converted
* @stable ICU 59
*/
inline ConstChar16Ptr(const uint16_t *p);
#endif
#if U_SIZEOF_WCHAR_T==2 || defined(U_IN_DOXYGEN)
/**
* Converts the pointer to const char16_t *.
* (Only defined if U_SIZEOF_WCHAR_T==2.)
* @param p pointer to be converted
* @stable ICU 59
*/
inline ConstChar16Ptr(const wchar_t *p);
#endif
/**
* nullptr constructor.
* @param p nullptr
* @stable ICU 59
*/
inline ConstChar16Ptr(const std::nullptr_t p);
/**
* Destructor.
* When U_ALIASING_BARRIER is defined, applies the barrier to the wrapped pointer;
* otherwise does nothing.
* @stable ICU 59
*/
inline ~ConstChar16Ptr();
/**
* Pointer access.
* @return the wrapped pointer
* @stable ICU 59
*/
inline const char16_t *get() const;
/**
* const char16_t pointer access via type conversion (e.g., static_cast).
* Forwards to get().
* @return the wrapped pointer
* @stable ICU 59
*/
inline operator const char16_t *() const { return get(); }
private:
// A ConstChar16Ptr is always constructed from a pointer value (which may be nullptr).
ConstChar16Ptr() = delete;
#ifdef U_ALIASING_BARRIER
// Barrier variant: apply the aliasing barrier to the incoming pointer,
// then reinterpret it as const char16_t *.
template<typename T> static const char16_t *cast(const T *t) {
U_ALIASING_BARRIER(t);
return reinterpret_cast<const char16_t *>(t);
}
// The wrapped pointer.
const char16_t *p_;
#else
// Fallback variant without a barrier macro: the constructors store the pointer
// in the union member of the matching type, and get() reads it back through cp.
union {
const char16_t *cp;
const uint16_t *up;
const wchar_t *wp;
} u_;
#endif
};
/// \cond
#ifdef U_ALIASING_BARRIER
// Barrier-based implementation: the wrapped pointer is held in p_.
// A const char16_t * (or nullptr) is stored as is.
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) : p_(p) {}
#if !U_CHAR16_IS_TYPEDEF
// Other pointer types go through cast(), which applies the barrier before reinterpreting.
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) : p_(cast(p)) {}
#endif
#if U_SIZEOF_WCHAR_T==2
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) : p_(cast(p)) {}
#endif
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) : p_(p) {}
// The destructor applies the barrier to the wrapped pointer once more.
ConstChar16Ptr::~ConstChar16Ptr() {
U_ALIASING_BARRIER(p_);
}
const char16_t *ConstChar16Ptr::get() const { return p_; }
#else
// Union-based fallback: each constructor writes the union member whose type
// matches its argument; get() always reads the cp member.
ConstChar16Ptr::ConstChar16Ptr(const char16_t *p) { u_.cp = p; }
#if !U_CHAR16_IS_TYPEDEF
ConstChar16Ptr::ConstChar16Ptr(const uint16_t *p) { u_.up = p; }
#endif
#if U_SIZEOF_WCHAR_T==2
ConstChar16Ptr::ConstChar16Ptr(const wchar_t *p) { u_.wp = p; }
#endif
ConstChar16Ptr::ConstChar16Ptr(const std::nullptr_t p) { u_.cp = p; }
ConstChar16Ptr::~ConstChar16Ptr() {}
const char16_t *ConstChar16Ptr::get() const { return u_.cp; }
#endif
/// \endcond
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
// Usable in header-only definitions
#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
namespace U_ICU_NAMESPACE_OR_INTERNAL {
#ifndef U_FORCE_HIDE_INTERNAL_API
/**
 * Returns a const UChar pointer as a const char16_t pointer.
 * If UChar already is char16_t the pointer is returned unchanged. Otherwise it is
 * converted via ConstChar16Ptr when the C++ API is shown, or by an aliasing barrier
 * (if available) followed by a reinterpret_cast when it is not.
 * @internal
 */
template<typename T, typename = std::enable_if_t<std::is_same_v<T, UChar>>>
inline const char16_t *uprv_char16PtrFromUChar(const T *p) {
    if constexpr (!std::is_same_v<UChar, char16_t>) {
#if !(U_SHOW_CPLUSPLUS_API)
#if defined(U_ALIASING_BARRIER)
        U_ALIASING_BARRIER(p);
#endif
        return reinterpret_cast<const char16_t *>(p);
#else
        return ConstChar16Ptr(p).get();
#endif
    } else {
        // Same type: no conversion needed.
        return p;
    }
}
#if !U_CHAR16_IS_TYPEDEF && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 180000)
/**
 * Returns a const uint16_t pointer as a const char16_t pointer.
 * Uses ConstChar16Ptr when the C++ API is shown; otherwise applies the
 * aliasing barrier (if available) and reinterprets the pointer directly.
 * @internal
 */
inline const char16_t *uprv_char16PtrFromUint16(const uint16_t *p) {
#if !(U_SHOW_CPLUSPLUS_API)
#if defined(U_ALIASING_BARRIER)
    U_ALIASING_BARRIER(p);
#endif
    return reinterpret_cast<const char16_t *>(p);
#else
    return ConstChar16Ptr(p).get();
#endif
}
#endif
#if U_SIZEOF_WCHAR_T==2
/**
 * Returns a const wchar_t pointer as a const char16_t pointer.
 * Uses ConstChar16Ptr when the C++ API is shown; otherwise applies the
 * aliasing barrier (if available) and reinterprets the pointer directly.
 * @internal
 */
inline const char16_t *uprv_char16PtrFromWchar(const wchar_t *p) {
#if !(U_SHOW_CPLUSPLUS_API)
#if defined(U_ALIASING_BARRIER)
    U_ALIASING_BARRIER(p);
#endif
    return reinterpret_cast<const char16_t *>(p);
#else
    return ConstChar16Ptr(p).get();
#endif
}
#endif
#endif
/**
 * Reinterprets a const char16_t pointer as a const UChar pointer.
 * Applies the aliasing barrier first if one is available.
 * @param p pointer
 * @return p as const UChar *
 * @stable ICU 59
 */
inline const UChar *toUCharPtr(const char16_t *p) {
#if defined(U_ALIASING_BARRIER)
    U_ALIASING_BARRIER(p);
#endif
    const UChar *const converted = reinterpret_cast<const UChar *>(p);
    return converted;
}
/**
 * Reinterprets a char16_t pointer as a UChar pointer.
 * Applies the aliasing barrier first if one is available.
 * @param p pointer
 * @return p as UChar *
 * @stable ICU 59
 */
inline UChar *toUCharPtr(char16_t *p) {
#if defined(U_ALIASING_BARRIER)
    U_ALIASING_BARRIER(p);
#endif
    UChar *const converted = reinterpret_cast<UChar *>(p);
    return converted;
}
/**
 * Reinterprets a const char16_t pointer as a const OldUChar pointer.
 * Applies the aliasing barrier first if one is available.
 * @param p pointer
 * @return p as const OldUChar *
 * @stable ICU 59
 */
inline const OldUChar *toOldUCharPtr(const char16_t *p) {
#if defined(U_ALIASING_BARRIER)
    U_ALIASING_BARRIER(p);
#endif
    const OldUChar *const converted = reinterpret_cast<const OldUChar *>(p);
    return converted;
}
/**
 * Reinterprets a char16_t pointer as an OldUChar pointer.
 * Applies the aliasing barrier first if one is available.
 * @param p pointer
 * @return p as OldUChar *
 * @stable ICU 59
 */
inline OldUChar *toOldUCharPtr(char16_t *p) {
#if defined(U_ALIASING_BARRIER)
    U_ALIASING_BARRIER(p);
#endif
    OldUChar *const converted = reinterpret_cast<OldUChar *>(p);
    return converted;
}
} // U_ICU_NAMESPACE_OR_INTERNAL
#endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
// ICU DLL-exported
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
#ifndef U_FORCE_HIDE_INTERNAL_API
/**
 * True if T converts implicitly to std::u16string_view or to one of the other
 * 16-bit string view types enabled on this platform
 * (std::basic_string_view<uint16_t>, std::wstring_view).
 * @internal
 */
template<typename T>
constexpr bool ConvertibleToU16StringView =
#if U_SIZEOF_WCHAR_T==2
    std::is_convertible_v<T, std::wstring_view> ||
#endif
#if !U_CHAR16_IS_TYPEDEF && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 180000)
    std::is_convertible_v<T, std::basic_string_view<uint16_t>> ||
#endif
    std::is_convertible_v<T, std::u16string_view>;
namespace internal {
/**
 * Identity overload: the argument already is a std::u16string_view
 * and is returned as is.
 * @internal
 */
inline std::u16string_view toU16StringView(std::u16string_view sv) {
    return sv;
}
#if !U_CHAR16_IS_TYPEDEF && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION < 180000)
/**
 * Converts a std::basic_string_view<uint16_t> to a std::u16string_view over the
 * same storage and with the same length.
 * Basically undefined behavior, but sometimes a necessary conversion.
 * @internal
 */
inline std::u16string_view toU16StringView(std::basic_string_view<uint16_t> sv) {
    return std::u16string_view(ConstChar16Ptr(sv.data()).get(), sv.length());
}
#endif
#if U_SIZEOF_WCHAR_T==2
/**
 * Converts a std::wstring_view to a std::u16string_view over the
 * same storage and with the same length.
 * Basically undefined behavior, but sometimes a necessary conversion.
 * @internal
 */
inline std::u16string_view toU16StringView(std::wstring_view sv) {
    return std::u16string_view(ConstChar16Ptr(sv.data()).get(), sv.length());
}
#endif
/**
 * Overload for non-pointer arguments: there is no null case to handle,
 * so the argument is forwarded straight to toU16StringView().
 * @internal
 */
template <typename T,
          typename = std::enable_if_t<!std::is_pointer_v<std::remove_reference_t<T>>>>
inline std::u16string_view toU16StringViewNullable(const T& text) {
    return toU16StringView(text);
}
/**
 * Overload for pointer arguments: a null pointer yields an empty view,
 * anything else is forwarded to toU16StringView().
 * The extra defaulted template parameter keeps this template distinct
 * from the non-pointer overload.
 * @internal
 */
template <typename T,
          typename = std::enable_if_t<std::is_pointer_v<std::remove_reference_t<T>>>,
          typename = void>
inline std::u16string_view toU16StringViewNullable(const T& text) {
    if (text != nullptr) {
        return toU16StringView(text);
    }
    return {};  // For backward compatibility.
}
} // internal
#endif // U_FORCE_HIDE_INTERNAL_API
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
#endif // __CHAR16PTR_H__

View File

@@ -0,0 +1,734 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
********************************************************************
*
* Copyright (C) 1997-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
********************************************************************
*/
#ifndef CHARITER_H
#define CHARITER_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
#include "unicode/unistr.h"
/**
* \file
* \brief C++ API: Character Iterator
*/
U_NAMESPACE_BEGIN
/**
* Abstract class that defines an API for forward-only iteration
* on text objects.
* This is a minimal interface for iteration without random access
* or backwards iteration. It is especially useful for wrapping
* streams with converters into an object for collation or
* normalization.
*
* <p>Characters can be accessed in two ways: as code units or as
* code points.
* Unicode code points are 21-bit integers and are the scalar values
* of Unicode characters. ICU uses the type UChar32 for them.
* Unicode code units are the storage units of a given
* Unicode/UCS Transformation Format (a character encoding scheme).
* With UTF-16, all code points can be represented with either one
* or two code units ("surrogates").
* String storage is typically based on code units, while properties
* of characters are typically determined using code point values.
* Some processes may be designed to work with sequences of code units,
* or it may be known that all characters that are important to an
* algorithm can be represented with single code units.
* Other processes will need to use the code point access functions.</p>
*
* <p>ForwardCharacterIterator provides nextPostInc() to access
* a code unit and advance an internal position into the text object,
* similar to a <code>return text[position++]</code>.<br>
* It provides next32PostInc() to access a code point and advance an internal
* position.</p>
*
* <p>next32PostInc() assumes that the current position is that of
* the beginning of a code point, i.e., of its first code unit.
* After next32PostInc(), this will be true again.
* In general, access to code units and code points in the same
* iteration loop should not be mixed. In UTF-16, if the current position
* is on a second code unit (Low Surrogate), then only that code unit
* is returned even by next32PostInc().</p>
*
* <p>For iteration with either function, there are two ways to
* check for the end of the iteration. When there are no more
* characters in the text object:
* <ul>
* <li>The hasNext() function returns false.</li>
* <li>nextPostInc() and next32PostInc() return DONE
* when one attempts to read beyond the end of the text object.</li>
* </ul>
*
* Example:
* \code
* void function1(ForwardCharacterIterator &it) {
* UChar32 c;
* while(it.hasNext()) {
* c=it.next32PostInc();
* // use c
* }
* }
*
* void function2(ForwardCharacterIterator &it) {
* char16_t c;
* while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
* // use c
* }
* }
* \endcode
* </p>
*
* @stable ICU 2.0
*/
class U_COMMON_API ForwardCharacterIterator : public UObject {
public:
/**
* Value returned by most of ForwardCharacterIterator's functions
* when the iterator has reached the limits of its iteration.
* Note that 0xffff also fits into a char16_t, so a code unit with that value
* compares equal to DONE; hasNext() is the unambiguous end-of-iteration test.
* @stable ICU 2.0
*/
enum { DONE = 0xffff };
/**
* Destructor.
* @stable ICU 2.0
*/
virtual ~ForwardCharacterIterator();
/**
* Returns true when both iterators refer to the same
* character in the same character-storage object.
* @param that The ForwardCharacterIterator to be compared for equality
* @return true when both iterators refer to the same
* character in the same character-storage object
* @stable ICU 2.0
*/
virtual bool operator==(const ForwardCharacterIterator& that) const = 0;
/**
* Returns true when the iterators refer to different
* text-storage objects, or to different characters in the
* same text-storage object.
* Implemented inline as the negation of operator==().
* @param that The ForwardCharacterIterator to be compared for inequality
* @return true when the iterators refer to different
* text-storage objects, or to different characters in the
* same text-storage object
* @stable ICU 2.0
*/
inline bool operator!=(const ForwardCharacterIterator& that) const;
/**
* Generates a hash code for this iterator.
* @return the hash code.
* @stable ICU 2.0
*/
virtual int32_t hashCode() const = 0;
/**
* Returns a UClassID for this ForwardCharacterIterator ("poor man's
* RTTI").<P> Despite the fact that this function is public,
* DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
* @return a UClassID for this ForwardCharacterIterator
* @stable ICU 2.0
*/
virtual UClassID getDynamicClassID() const override = 0;
/**
* Gets the current code unit for returning and advances to the next code unit
* in the iteration range
* (toward endIndex()). If there are
* no more code units to return, returns DONE.
* @return the current code unit.
* @stable ICU 2.0
*/
virtual char16_t nextPostInc() = 0;
/**
* Gets the current code point for returning and advances to the next code point
* in the iteration range
* (toward endIndex()). If there are
* no more code points to return, returns DONE.
* @return the current code point.
* @stable ICU 2.0
*/
virtual UChar32 next32PostInc() = 0;
/**
* Returns false if there are no more code units or code points
* at or after the current position in the iteration range.
* This is used with nextPostInc() or next32PostInc() in forward
* iteration.
* @return false if there are no more code units or code points
* at or after the current position in the iteration range.
* @stable ICU 2.0
*/
virtual UBool hasNext() = 0;
protected:
/** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
ForwardCharacterIterator();
/** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
ForwardCharacterIterator(const ForwardCharacterIterator &other);
/**
* Assignment operator to be overridden in the implementing class.
* This class declares no data members, so the base implementation
* copies nothing and just returns *this.
* @stable ICU 2.0
*/
ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
};
/**
* Abstract class that defines an API for iteration
* on text objects.
* This is an interface for forward and backward iteration
* and random access into a text object.
*
* <p>The API provides backward compatibility to the Java and older ICU
* CharacterIterator classes but extends them significantly:
* <ol>
* <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
* <li>While the old API functions provided forward iteration with
* "pre-increment" semantics, the new one also provides functions
* with "post-increment" semantics. They are more efficient and should
* be the preferred iterator functions for new implementations.
* The backward iteration always had "pre-decrement" semantics, which
* are efficient.</li>
* <li>Just like ForwardCharacterIterator, it provides access to
* both code units and code points. Code point access versions are available
* for the old and the new iteration semantics.</li>
* <li>There are new functions for setting and moving the current position
* without returning a character, for efficiency.</li>
* </ol>
*
* See ForwardCharacterIterator for examples for using the new forward iteration
* functions. For backward iteration, there is also a hasPrevious() function
* that can be used analogously to hasNext().
* The old functions work as before and are shown below.</p>
*
* <p>Examples for some of the new functions:</p>
*
* Forward iteration with hasNext():
* \code
* void forward1(CharacterIterator &it) {
* UChar32 c;
* for(it.setToStart(); it.hasNext();) {
* c=it.next32PostInc();
* // use c
* }
* }
* \endcode
* Forward iteration more similar to loops with the old forward iteration,
* showing a way to convert simple for() loops:
* \code
* void forward2(CharacterIterator &it) {
* char16_t c;
* for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
* // use c
* }
* }
* \endcode
* Backward iteration with setToEnd() and hasPrevious():
* \code
* void backward1(CharacterIterator &it) {
* UChar32 c;
* for(it.setToEnd(); it.hasPrevious();) {
* c=it.previous32();
* // use c
* }
* }
* \endcode
* Backward iteration with a more traditional for() loop:
* \code
* void backward2(CharacterIterator &it) {
* char16_t c;
* for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
* // use c
* }
* }
* \endcode
*
* Example for random access:
* \code
* void random(CharacterIterator &it) {
* // set to the third code point from the beginning
* it.move32(3, CharacterIterator::kStart);
* // get a code point from here without moving the position
* UChar32 c=it.current32();
* // get the position
* int32_t pos=it.getIndex();
* // get the previous code unit
* char16_t u=it.previous();
* // move back one more code unit
* it.move(-1, CharacterIterator::kCurrent);
* // set the position back to where it was
* // and read the same code point c and move beyond it
* it.setIndex(pos);
* if(c!=it.next32PostInc()) {
* exit(1); // CharacterIterator inconsistent
* }
* }
* \endcode
*
* <p>Examples, especially for the old API:</p>
*
* Function processing characters, in this example simple output
* <pre>
* \code
* void processChar( char16_t c )
* {
* cout << " " << c;
* }
* \endcode
* </pre>
* Traverse the text from start to finish
* <pre>
* \code
* void traverseForward(CharacterIterator& iter)
* {
* for(char16_t c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
* processChar(c);
* }
* }
* \endcode
* </pre>
* Traverse the text backwards, from end to start
* <pre>
* \code
* void traverseBackward(CharacterIterator& iter)
* {
* for(char16_t c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
* processChar(c);
* }
* }
* \endcode
* </pre>
* Traverse both forward and backward from a given position in the text.
* Calls to notBoundary() in this example represents some additional stopping criteria.
* <pre>
* \code
* void traverseOut(CharacterIterator& iter, int32_t pos)
* {
* char16_t c;
* for (c = iter.setIndex(pos);
* c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
* c = iter.next()) {}
* int32_t end = iter.getIndex();
* for (c = iter.setIndex(pos);
* c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
* c = iter.previous()) {}
* int32_t start = iter.getIndex() + 1;
*
* cout << "start: " << start << " end: " << end << endl;
* for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
* processChar(c);
* }
* }
* \endcode
* </pre>
* Creating a StringCharacterIterator and calling the test functions
* <pre>
* \code
* void CharacterIterator_Example( void )
* {
* cout << endl << "===== CharacterIterator_Example: =====" << endl;
* UnicodeString text("Ein kleiner Satz.");
* StringCharacterIterator iterator(text);
* cout << "----- traverseForward: -----------" << endl;
* traverseForward( iterator );
* cout << endl << endl << "----- traverseBackward: ----------" << endl;
* traverseBackward( iterator );
* cout << endl << endl << "----- traverseOut: ---------------" << endl;
* traverseOut( iterator, 7 );
* cout << endl << endl << "-----" << endl;
* }
* \endcode
* </pre>
*
* @stable ICU 2.0
*/
class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
public:
/**
* Origin enumeration for the move() and move32() functions.
* @stable ICU 2.0
*/
enum EOrigin { kStart, kCurrent, kEnd };
/**
* Destructor.
* @stable ICU 2.0
*/
virtual ~CharacterIterator();
/**
* Returns a pointer to a new CharacterIterator of the same
* concrete class as this one, and referring to the same
* character in the same text-storage object as this one. The
* caller is responsible for deleting the new clone.
* @return a pointer to a new CharacterIterator
* @stable ICU 2.0
*/
virtual CharacterIterator* clone() const = 0;
/**
* Sets the iterator to refer to the first code unit in its
* iteration range, and returns that code unit.
* This can be used to begin an iteration with next().
* @return the first code unit in its iteration range.
* @stable ICU 2.0
*/
virtual char16_t first() = 0;
/**
* Sets the iterator to refer to the first code unit in its
* iteration range, returns that code unit, and moves the position
* to the second code unit. This is an alternative to setToStart()
* for forward iteration with nextPostInc().
* @return the first code unit in its iteration range.
* @stable ICU 2.0
*/
virtual char16_t firstPostInc();
/**
* Sets the iterator to refer to the first code point in its
* iteration range, and returns that code unit,
* This can be used to begin an iteration with next32().
* Note that an iteration with next32PostInc(), beginning with,
* e.g., setToStart() or firstPostInc(), is more efficient.
* @return the first code point in its iteration range.
* @stable ICU 2.0
*/
virtual UChar32 first32() = 0;
/**
* Sets the iterator to refer to the first code point in its
* iteration range, returns that code point, and moves the position
* to the second code point. This is an alternative to setToStart()
* for forward iteration with next32PostInc().
* @return the first code point in its iteration range.
* @stable ICU 2.0
*/
virtual UChar32 first32PostInc();
/**
* Sets the iterator to refer to the first code unit or code point in its
* iteration range. This can be used to begin a forward
* iteration with nextPostInc() or next32PostInc().
* @return the start position of the iteration range
* @stable ICU 2.0
*/
inline int32_t setToStart();
/**
* Sets the iterator to refer to the last code unit in its
* iteration range, and returns that code unit.
* This can be used to begin an iteration with previous().
* @return the last code unit.
* @stable ICU 2.0
*/
virtual char16_t last() = 0;
/**
* Sets the iterator to refer to the last code point in its
* iteration range, and returns that code unit.
* This can be used to begin an iteration with previous32().
* @return the last code point.
* @stable ICU 2.0
*/
virtual UChar32 last32() = 0;
/**
* Sets the iterator to the end of its iteration range, just behind
* the last code unit or code point. This can be used to begin a backward
* iteration with previous() or previous32().
* @return the end position of the iteration range
* @stable ICU 2.0
*/
inline int32_t setToEnd();
/**
* Sets the iterator to refer to the "position"-th code unit
* in the text-storage object the iterator refers to, and
* returns that code unit.
* @param position the "position"-th code unit in the text-storage object
* @return the "position"-th code unit.
* @stable ICU 2.0
*/
virtual char16_t setIndex(int32_t position) = 0;
/**
* Sets the iterator to refer to the beginning of the code point
* that contains the "position"-th code unit
* in the text-storage object the iterator refers to, and
* returns that code point.
* The current position is adjusted to the beginning of the code point
* (its first code unit).
* @param position the "position"-th code unit in the text-storage object
* @return the "position"-th code point.
* @stable ICU 2.0
*/
virtual UChar32 setIndex32(int32_t position) = 0;
/**
* Returns the code unit the iterator currently refers to.
* @return the current code unit.
* @stable ICU 2.0
*/
virtual char16_t current() const = 0;
/**
* Returns the code point the iterator currently refers to.
* @return the current code point.
* @stable ICU 2.0
*/
virtual UChar32 current32() const = 0;
/**
* Advances to the next code unit in the iteration range
* (toward endIndex()), and returns that code unit. If there are
* no more code units to return, returns DONE.
* @return the next code unit.
* @stable ICU 2.0
*/
virtual char16_t next() = 0;
/**
* Advances to the next code point in the iteration range
* (toward endIndex()), and returns that code point. If there are
* no more code points to return, returns DONE.
* Note that iteration with "pre-increment" semantics is less
* efficient than iteration with "post-increment" semantics
* that is provided by next32PostInc().
* @return the next code point.
* @stable ICU 2.0
*/
virtual UChar32 next32() = 0;
/**
* Advances to the previous code unit in the iteration range
* (toward startIndex()), and returns that code unit. If there are
* no more code units to return, returns DONE.
* @return the previous code unit.
* @stable ICU 2.0
*/
virtual char16_t previous() = 0;
/**
* Advances to the previous code point in the iteration range
* (toward startIndex()), and returns that code point. If there are
* no more code points to return, returns DONE.
* @return the previous code point.
* @stable ICU 2.0
*/
virtual UChar32 previous32() = 0;
/**
* Returns false if there are no more code units or code points
* before the current position in the iteration range.
* This is used with previous() or previous32() in backward
* iteration.
* @return false if there are no more code units or code points
* before the current position in the iteration range, return true otherwise.
* @stable ICU 2.0
*/
virtual UBool hasPrevious() = 0;
/**
* Returns the numeric index in the underlying text-storage
* object of the character returned by first(). Since it's
* possible to create an iterator that iterates across only
* part of a text-storage object, this number isn't
* necessarily 0.
* @returns the numeric index in the underlying text-storage
* object of the character returned by first().
* @stable ICU 2.0
*/
inline int32_t startIndex() const;
/**
* Returns the numeric index in the underlying text-storage
* object of the position immediately BEYOND the character
* returned by last().
* @return the numeric index in the underlying text-storage
* object of the position immediately BEYOND the character
* returned by last().
* @stable ICU 2.0
*/
inline int32_t endIndex() const;
/**
* Returns the numeric index in the underlying text-storage
* object of the character the iterator currently refers to
* (i.e., the character returned by current()).
* @return the numeric index in the text-storage object of
* the character the iterator currently refers to
* @stable ICU 2.0
*/
inline int32_t getIndex() const;
/**
* Returns the length of the entire text in the underlying
* text-storage object.
* @return the length of the entire text in the text-storage object
* @stable ICU 2.0
*/
inline int32_t getLength() const;
/**
* Moves the current position relative to the start or end of the
* iteration range, or relative to the current position itself.
* The movement is expressed in numbers of code units forward
* or backward by specifying a positive or negative delta.
* @param delta the position relative to origin. A positive delta means forward;
* a negative delta means backward.
* @param origin Origin enumeration {kStart, kCurrent, kEnd}
* @return the new position
* @stable ICU 2.0
*/
virtual int32_t move(int32_t delta, EOrigin origin) = 0;
/**
* Moves the current position relative to the start or end of the
* iteration range, or relative to the current position itself.
* The movement is expressed in numbers of code points forward
* or backward by specifying a positive or negative delta.
* @param delta the position relative to origin. A positive delta means forward;
* a negative delta means backward.
* @param origin Origin enumeration {kStart, kCurrent, kEnd}
* @return the new position
* @stable ICU 2.0
*/
#ifdef move32
// One of the system headers right now is sometimes defining a conflicting macro we don't use
#undef move32
#endif
virtual int32_t move32(int32_t delta, EOrigin origin) = 0;
/**
* Copies the text under iteration into the UnicodeString
* referred to by "result".
* @param result Receives a copy of the text under iteration.
* @stable ICU 2.0
*/
virtual void getText(UnicodeString& result) = 0;
protected:
/**
* Empty constructor.
* @stable ICU 2.0
*/
CharacterIterator();
/**
* Constructor, just setting the length field in this base class.
* @stable ICU 2.0
*/
CharacterIterator(int32_t length);
/**
* Constructor, just setting the length and position fields in this base class.
* @stable ICU 2.0
*/
CharacterIterator(int32_t length, int32_t position);
/**
* Constructor, just setting the length, start, end, and position fields in this base class.
* @stable ICU 2.0
*/
CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
/**
* Copy constructor.
*
* @param that The CharacterIterator to be copied
* @stable ICU 2.0
*/
CharacterIterator(const CharacterIterator &that);
/**
* Assignment operator. Sets this CharacterIterator to have the same behavior
* as the one passed in.
* @param that The CharacterIterator passed in.
* @return the newly set CharacterIterator.
* @stable ICU 2.0
*/
CharacterIterator &operator=(const CharacterIterator &that);
/**
* Base class text length field.
* Necessary for correct getText() and hashCode().
* @stable ICU 2.0
*/
int32_t textLength;
/**
* Base class field for the current position.
* @stable ICU 2.0
*/
int32_t pos;
/**
* Base class field for the start of the iteration range.
* @stable ICU 2.0
*/
int32_t begin;
/**
* Base class field for the end of the iteration range.
* @stable ICU 2.0
*/
int32_t end;
};
// Inequality is defined as the exact negation of this class's operator==.
inline bool
ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
    const bool sameAsThat = this->operator==(that);
    return !sameAsThat;
}
// Repositions via a zero-offset move() relative to kStart and hands back
// whatever position move() reports.
inline int32_t
CharacterIterator::setToStart() {
    const int32_t newPosition = this->move(0, kStart);
    return newPosition;
}
// Repositions via a zero-offset move() relative to kEnd and hands back
// whatever position move() reports.
inline int32_t
CharacterIterator::setToEnd() {
    const int32_t newPosition = this->move(0, kEnd);
    return newPosition;
}
// Accessor for the base-class field holding the start of the iteration range.
inline int32_t
CharacterIterator::startIndex() const {
    return this->begin;
}
// Accessor for the base-class field holding the end of the iteration range.
inline int32_t
CharacterIterator::endIndex() const {
    return this->end;
}
// Accessor for the base-class field holding the current position.
inline int32_t
CharacterIterator::getIndex() const {
    return this->pos;
}
// Accessor for the base-class text length field.
inline int32_t
CharacterIterator::getLength() const {
    return this->textLength;
}
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

48
thirdparty/icu4c/common/unicode/dbbi.h vendored Normal file
View File

@@ -0,0 +1,48 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2006,2013 IBM Corp. All rights reserved.
**********************************************************************
* Date Name Description
* 12/1/99 rgillam Complete port from Java.
* 01/13/2000 helena Added UErrorCode to ctors.
**********************************************************************
*/
#ifndef DBBI_H
#define DBBI_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/rbbi.h"
#if !UCONFIG_NO_BREAK_ITERATION
/**
* \file
* \brief C++ API: Dictionary Based Break Iterator
*/
U_NAMESPACE_BEGIN
#ifndef U_HIDE_DEPRECATED_API
/**
 * Obsolete name for RuleBasedBreakIterator, kept as a plain type alias.
 * Dictionary-based break iteration is handled by the base class itself,
 * so no separate subclass exists any more. Deprecated as of ICU 3.6.
 * @deprecated ICU 3.6
 */
using DictionaryBasedBreakIterator = RuleBasedBreakIterator;
#endif /* U_HIDE_DEPRECATED_API */
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,247 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2012, International Business Machines Corporation and
* others. All Rights Reserved.
*
* FILE NAME: DOCMAIN.h
*
* Date Name Description
* 12/11/2000 Ram Creation.
*/
/**
* \file
* \brief (Non-API: contains Doxygen definitions)
*
* This file contains documentation for Doxygen and does not have
* any significance with respect to C or C++ API
*/
/*! \mainpage
*
* \section API API Reference Usage
*
* <h3>C++ Programmers:</h3>
* <p>Use <a href="hierarchy.html">Class Hierarchy</a> or <a href="classes.html"> Alphabetical List </a>
* or <a href="annotated.html"> Compound List</a>
* to find the class you are interested in. For example, to find BreakIterator,
* you can go to the <a href="classes.html"> Alphabetical List</a>, then click on
* "BreakIterator". Once you are at the class, you will find an inheritance
* chart, a list of the public members, a detailed description of the class,
* then detailed member descriptions.</p>
*
* <h3>C Programmers:</h3>
* <p>Use <a href="#Module">Module List</a> or <a href="globals_u.html">File Members</a>
* to find a list of all the functions and constants.
* For example, to find BreakIterator functions you would click on
* <a href="files.html"> File List</a>,
* then find "ubrk.h" and click on it. You will find descriptions of Defines,
* Typedefs, Enumerations, and Functions, with detailed descriptions below.
* If you want to find a specific function, such as ubrk_next(), then click
* first on <a href="globals.html"> File Members</a>, then use your browser
* Find dialog to search for "ubrk_next()".</p>
*
*
* <h3>API References for Previous Releases</h3>
* <p>The API References for each release of ICU are also available as
* a zip file from the ICU
* <a href="https://icu.unicode.org/download">download page</a>.</p>
*
* <hr>
*
* <h2>Architecture (User's Guide)</h2>
* <ul>
* <li><a href="https://unicode-org.github.io/icu/userguide/">Introduction</a></li>
* <li><a href="https://unicode-org.github.io/icu/userguide/i18n">Internationalization</a></li>
* <li><a href="https://unicode-org.github.io/icu/userguide/design">Locale Model, Multithreading, Error Handling, etc.</a></li>
* <li><a href="https://unicode-org.github.io/icu/userguide/conversion">Conversion</a></li>
* </ul>
*
* <hr>
*\htmlonly <h2><a NAME="Module">Module List</a></h2> \endhtmlonly
* <table border="1" cols="3" align="center">
* <tr>
* <td><strong>Module Name</strong></td>
* <td><strong>C</strong></td>
* <td><strong>C++</strong></td>
* </tr>
* <tr>
* <td>Basic Types and Constants</td>
* <td>utypes.h</td>
* <td>utypes.h</td>
* </tr>
* <tr>
* <td>Strings and Character Iteration</td>
* <td>ustring.h, utf8.h, utf16.h, icu::StringPiece, UText, UCharIterator, icu::ByteSink</td>
* <td>icu::UnicodeString, icu::CharacterIterator, icu::Appendable, icu::StringPiece, icu::ByteSink</td>
* </tr>
* <tr>
* <td>Unicode Character<br/>Properties and Names</td>
* <td>uchar.h, uscript.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Sets of Unicode Code Points and Strings</td>
* <td>uset.h</td>
* <td>icu::UnicodeSet</td>
* </tr>
* <tr>
* <td>Maps from Unicode Code Points to Integer Values</td>
* <td>ucptrie.h, umutablecptrie.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Maps from Strings to Integer Values</td>
* <td>(no C API)</td>
* <td>icu::BytesTrie, icu::UCharsTrie</td>
* </tr>
* <tr>
* <td>Codepage Conversion</td>
* <td>ucnv.h, ucnvsel.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Codepage Detection</td>
* <td>ucsdet.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Unicode Text Compression</td>
* <td>ucnv.h<br/>(encoding name "SCSU" or "BOCU-1")</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Locales </td>
* <td>uloc.h, ulocale.h, ulocbuilder.h</td>
* <td>icu::Locale, icu::LocaleBuilder, icu::LocaleMatcher</td>
* </tr>
* <tr>
* <td>Resource Bundles</td>
* <td>ures.h</td>
* <td>icu::ResourceBundle</td>
* </tr>
* <tr>
* <td>Normalization</td>
* <td>unorm2.h</td>
* <td>icu::Normalizer2</td>
* </tr>
* <tr>
* <td>Calendars and Time Zones</td>
* <td>ucal.h</td>
* <td>icu::Calendar, icu::TimeZone</td>
* </tr>
* <tr>
* <td>Date and Time Formatting</td>
* <td>udat.h</td>
* <td>icu::DateFormat</td>
* </tr>
* <tr>
* <td>Relative Date and Time Formatting</td>
* <td>ureldatefmt.h</td>
* <td>icu::RelativeDateTimeFormatter</td>
* </tr>
* <tr>
* <td>Message Formatting</td>
* <td>umsg.h</td>
* <td>icu::MessageFormat</td>
* </tr>
* <tr>
* <td>Message Formatting 2<br/>(technology preview)</td>
* <td>(no C API)</td>
* <td>icu::message2::MessageFormatter</td>
* </tr>
* <tr>
* <td>List Formatting</td>
* <td>ulistformatter.h</td>
* <td>icu::ListFormatter</td>
* </tr>
* <tr>
* <td>Number Formatting<br/>(includes currency and unit formatting)</td>
* <td>unumberformatter.h, unum.h, usimplenumberformatter.h</td>
* <td>icu::number::NumberFormatter (ICU 60+) or icu::NumberFormat (older versions)<br>icu::number::SimpleNumberFormatter (ICU 73+)</td>
* </tr>
* <tr>
* <td>Number Range Formatting<br />(includes currency and unit ranges)</td>
* <td>unumberrangeformatter.h</td>
* <td>icu::number::NumberRangeFormatter</td>
* </tr>
* <tr>
* <td>Number Spellout<br/>(Rule Based Number Formatting)</td>
* <td>unum.h<br/>(use UNUM_SPELLOUT)</td>
* <td>icu::RuleBasedNumberFormat</td>
* </tr>
* <tr>
* <td>Text Transformation<br/>(Transliteration)</td>
* <td>utrans.h</td>
* <td>icu::Transliterator</td>
* </tr>
* <tr>
* <td>Bidirectional Algorithm</td>
* <td>ubidi.h, ubiditransform.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Arabic Shaping</td>
* <td>ushape.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Collation</td>
* <td>ucol.h</td>
* <td>icu::Collator</td>
* </tr>
* <tr>
* <td>String Searching</td>
* <td>usearch.h</td>
* <td>icu::StringSearch</td>
* </tr>
* <tr>
* <td>Index Characters/<br/>Bucketing for Sorted Lists</td>
* <td>(no C API)</td>
* <td>icu::AlphabeticIndex</td>
* </tr>
* <tr>
* <td>Text Boundary Analysis<br/>(Break Iteration)</td>
* <td>ubrk.h</td>
* <td>icu::BreakIterator</td>
* </tr>
* <tr>
* <td>Regular Expressions</td>
* <td>uregex.h</td>
* <td>icu::RegexPattern, icu::RegexMatcher</td>
* </tr>
* <tr>
* <td>StringPrep</td>
* <td>usprep.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>International Domain Names in Applications:<br/>
* UTS #46 in C/C++, IDNA2003 only via C API</td>
* <td>uidna.h</td>
* <td>idna.h</td>
* </tr>
* <tr>
* <td>Identifier Spoofing &amp; Confusability</td>
* <td>uspoof.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Universal Time Scale</td>
* <td>utmscale.h</td>
* <td>C API</td>
* </tr>
* <tr>
* <td>Paragraph Layout / Complex Text Layout</td>
* <td>playout.h</td>
* <td>icu::ParagraphLayout</td>
* </tr>
* <tr>
* <td>ICU I/O</td>
* <td>ustdio.h</td>
* <td>ustream.h</td>
* </tr>
* </table>
* <i>This main page is generated from docmain.h</i>
*/

View File

@@ -0,0 +1,163 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2008-2009, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*
* File DTINTRV.H
*
*******************************************************************************
*/
#ifndef __DTINTRV_H__
#define __DTINTRV_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: Date Interval data type
*/
U_NAMESPACE_BEGIN
/**
* This class represents a date interval.
* It is a pair of UDate values: a "from" date and a "to" date,
* readable through getFromDate() and getToDate().
* There is no default constructor; an interval is always built from a
* from/to pair or copied from another DateInterval.
* @stable ICU 4.0
**/
class U_COMMON_API DateInterval : public UObject {
public:
/**
* Construct a DateInterval given a from date and a to date.
* @param fromDate The from date in date interval.
* @param toDate The to date in date interval.
* @stable ICU 4.0
*/
DateInterval(UDate fromDate, UDate toDate);
/**
* Destructor.
* @stable ICU 4.0
*/
virtual ~DateInterval();
/**
* Get the from date.
* @return the from date in dateInterval.
* @stable ICU 4.0
*/
inline UDate getFromDate() const;
/**
* Get the to date.
* @return the to date in dateInterval.
* @stable ICU 4.0
*/
inline UDate getToDate() const;
/**
* Return the class ID for this class. This is useful only for comparing to
* a return value from getDynamicClassID(). For example:
* <pre>
* . Base* polymorphic_pointer = createPolymorphicObject();
* . if (polymorphic_pointer->getDynamicClassID() ==
* . derived::getStaticClassID()) ...
* </pre>
* @return The class ID for all objects of this class.
* @stable ICU 4.0
*/
static UClassID U_EXPORT2 getStaticClassID();
/**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override. This
* method is to implement a simple version of RTTI, since not all C++
* compilers support genuine RTTI. Polymorphic operator==() and clone()
* methods call this method.
*
* @return The class ID for this object. All objects of a
* given class have the same class ID. Objects of
* other classes have different class IDs.
* @stable ICU 4.0
*/
virtual UClassID getDynamicClassID() const override;
/**
* Copy constructor.
* @param other the DateInterval to copy
* @stable ICU 4.0
*/
DateInterval(const DateInterval& other);
/**
* Copy assignment operator.
* @stable ICU 4.0
*/
DateInterval& operator=(const DateInterval&);
/**
* Equality operator.
* @param other the DateInterval to compare with
* @return true if the two DateIntervals are the same
* @stable ICU 4.0
*/
virtual bool operator==(const DateInterval& other) const;
/**
* Non-equality operator. Defined inline as the negation of operator==().
* @param other the DateInterval to compare with
* @return true if the two DateIntervals are not the same
* @stable ICU 4.0
*/
inline bool operator!=(const DateInterval& other) const;
/**
* clone this object.
* The caller owns the result and should delete it when done.
* @return a cloned DateInterval
* @stable ICU 4.0
*/
virtual DateInterval* clone() const;
private:
/**
* Default constructor is explicitly deleted.
*/
DateInterval() = delete;
// Start of the interval; returned by getFromDate().
UDate fromDate;
// End of the interval; returned by getToDate().
UDate toDate;
} ;// end class DateInterval
// Accessor for the stored "from" date of the interval.
inline UDate
DateInterval::getFromDate() const {
    return this->fromDate;
}
// Accessor for the stored "to" date of the interval.
inline UDate
DateInterval::getToDate() const {
    return this->toDate;
}
// Inequality is the exact negation of the class's operator==.
inline bool
DateInterval::operator!=(const DateInterval& other) const {
    const bool sameInterval = this->operator==(other);
    return !sameInterval;
}
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

531
thirdparty/icu4c/common/unicode/edits.h vendored Normal file
View File

@@ -0,0 +1,531 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// edits.h
// created: 2016dec30 Markus W. Scherer
#ifndef __EDITS_H__
#define __EDITS_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: C++ class Edits for low-level string transformations on styled text.
*/
U_NAMESPACE_BEGIN
class UnicodeString;
/**
* Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions
* in linear progression. Does not support moving/reordering of text.
*
* There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to
* instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and
* {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity,
* whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one
* mapping between code points in the source and destination strings.
*
* After all edits have been added, instances of this class should be considered immutable, and an
* {@link Edits::Iterator} can be used for queries.
*
* There are four flavors of Edits::Iterator:
*
* <ul>
* <li>{@link #getFineIterator()} retains full granularity of change edits.
* <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling
* next() on the iterator, skips over no-change edits (unchanged regions).
* <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change
* edits are automatically merged during the construction phase.)
* <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when
* calling next() on the iterator, skips over no-change edits (unchanged regions).
* </ul>
*
* For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the
* following fine edits:
* <ul>
* <li>abc ⇨ abc (no-change)
* <li>ß ⇨ ss (change)
* <li>D ⇨ d (change)
* <li>e ⇨ e (no-change)
* <li>F ⇨ f (change)
* </ul>
* and the following coarse edits (note how adjacent change edits get merged together):
* <ul>
* <li>abc ⇨ abc (no-change)
* <li>ßD ⇨ ssd (change)
* <li>e ⇨ e (no-change)
* <li>F ⇨ f (change)
* </ul>
*
* The "fine changes" and "coarse changes" iterators will step through only the change edits when their
* `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when
* their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()`
* methods are used to walk through the string.
*
* For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in
* UCharacterCaseTest.java.
*
* An Edits object tracks a separate UErrorCode, but ICU string transformation functions
* (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
*
* @stable ICU 59
*/
class U_COMMON_API Edits final : public UMemory {
public:
/**
* Constructs an empty object.
* The edit array initially points at the embedded stackArray
* (capacity STACK_CAPACITY); all counters start at 0 and the
* stored error code at U_ZERO_ERROR.
* @stable ICU 59
*/
Edits() :
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
errorCode_(U_ZERO_ERROR) {}
/**
* Copy constructor.
* Copies the scalar state (length, delta, change count, error code) in the
* initializer list and then delegates the array contents to copyArray().
* @param other source edits
* @stable ICU 60
*/
Edits(const Edits &other) :
array(stackArray), capacity(STACK_CAPACITY), length(other.length),
delta(other.delta), numChanges(other.numChanges),
errorCode_(other.errorCode_) {
copyArray(other);
}
/**
* Move constructor, might leave src empty.
* This object will have the same contents that the source object had.
* Copies the scalar state in the initializer list and then delegates
* the array contents to moveArray().
* @param src source edits
* @stable ICU 60
*/
Edits(Edits &&src) noexcept :
array(stackArray), capacity(STACK_CAPACITY), length(src.length),
delta(src.delta), numChanges(src.numChanges),
errorCode_(src.errorCode_) {
moveArray(src);
}
/**
* Destructor.
* @stable ICU 59
*/
~Edits();
/**
* Assignment operator.
* @param other source edits
* @return *this
* @stable ICU 60
*/
Edits &operator=(const Edits &other);
/**
* Move assignment operator, might leave src empty.
* This object will have the same contents that the source object had.
* The behavior is undefined if *this and src are the same object.
* @param src source edits
* @return *this
* @stable ICU 60
*/
Edits &operator=(Edits &&src) noexcept;
/**
* Resets the data but may not release memory.
* @stable ICU 59
*/
void reset() noexcept;
/**
* Adds a no-change edit: a record for an unchanged segment of text.
* Normally called from inside ICU string transformation functions, not user code.
* @param unchangedLength length of the unchanged segment of text
* @stable ICU 59
*/
void addUnchanged(int32_t unchangedLength);
/**
* Adds a change edit: a record for a text replacement/insertion/deletion.
* Normally called from inside ICU string transformation functions, not user code.
* @param oldLength length of the replaced span in the source text
* @param newLength length of the replacement span in the destination text
* @stable ICU 59
*/
void addReplace(int32_t oldLength, int32_t newLength);
/**
* Sets the UErrorCode if an error occurred while recording edits.
* Preserves older error codes in the outErrorCode.
* Normally called from inside ICU string transformation functions, not user code.
* @param outErrorCode Set to an error code if it does not contain one already
* and an error occurred while recording edits.
* Otherwise unchanged.
* @return true if U_FAILURE(outErrorCode)
* @stable ICU 59
*/
UBool copyErrorTo(UErrorCode &outErrorCode) const;
/**
* How much longer is the new text compared with the old text?
* @return new length minus old length
* @stable ICU 59
*/
int32_t lengthDelta() const { return delta; }
/**
* @return true if there are any change edits
* @stable ICU 59
*/
UBool hasChanges() const { return numChanges != 0; }
/**
* @return the number of change edits
* @stable ICU 60
*/
int32_t numberOfChanges() const { return numChanges; }
/**
* Access to the list of edits.
*
* At any moment in time, an instance of this class points to a single edit: a "window" into a span
* of the source string and the corresponding span of the destination string. The source string span
* starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string
* span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars.
*
* The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`,
* and `findDestinationIndex(int32_t, UErrorCode &)` methods.
* Calling any of these methods mutates the iterator to make it point to the corresponding edit.
*
* For more information, see the documentation for {@link Edits}.
*
* @see getCoarseIterator
* @see getFineIterator
* @stable ICU 59
*/
struct U_COMMON_API Iterator final : public UMemory {
/**
* Default constructor, empty iterator.
* All fields are zero/false and the array pointer is nullptr.
* @stable ICU 60
*/
Iterator() :
array(nullptr), index(0), length(0),
remaining(0), onlyChanges_(false), coarse(false),
dir(0), changed(false), oldLength_(0), newLength_(0),
srcIndex(0), replIndex(0), destIndex(0) {}
/**
* Copy constructor.
* @stable ICU 59
*/
Iterator(const Iterator &other) = default;
/**
* Assignment operator.
* @stable ICU 59
*/
Iterator &operator=(const Iterator &other) = default;
/**
* Advances the iterator to the next edit.
* Forwards to the private next(onlyChanges, errorCode) overload using
* this iterator's own onlyChanges_ setting.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return true if there is another edit
* @stable ICU 59
*/
UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }
/**
* Moves the iterator to the edit that contains the source index.
* The source index may be found in a no-change edit
* even if normal iteration would skip no-change edits.
* Normal iteration can continue from a found edit.
*
* The iterator state before this search logically does not matter.
* (It may affect the performance of the search.)
*
* The iterator state after this search is undefined
* if the source index is out of bounds for the source string.
*
* @param i source index
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return true if the edit for the source index was found
* @stable ICU 59
*/
UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {
// findIndex() returns 0 only when the index was found.
return findIndex(i, true, errorCode) == 0;
}
/**
* Moves the iterator to the edit that contains the destination index.
* The destination index may be found in a no-change edit
* even if normal iteration would skip no-change edits.
* Normal iteration can continue from a found edit.
*
* The iterator state before this search logically does not matter.
* (It may affect the performance of the search.)
*
* The iterator state after this search is undefined
* if the destination index is out of bounds for the destination string.
*
* @param i destination index
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return true if the edit for the destination index was found
* @stable ICU 60
*/
UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {
// findIndex() returns 0 only when the index was found.
return findIndex(i, false, errorCode) == 0;
}
/**
* Computes the destination index corresponding to the given source index.
* If the source index is inside a change edit (not at its start),
* then the destination index at the end of that edit is returned,
* since there is no information about index mapping inside a change edit.
*
* (This means that indexes to the start and middle of an edit,
* for example around a grapheme cluster, are mapped to indexes
* encompassing the entire edit.
* The alternative, mapping an interior index to the start,
* would map such an interval to an empty one.)
*
* This operation will usually but not always modify this object.
* The iterator state after this search is undefined.
*
* @param i source index
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return destination index; undefined if i is not 0..string length
* @stable ICU 60
*/
int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);
/**
* Computes the source index corresponding to the given destination index.
* If the destination index is inside a change edit (not at its start),
* then the source index at the end of that edit is returned,
* since there is no information about index mapping inside a change edit.
*
* (This means that indexes to the start and middle of an edit,
* for example around a grapheme cluster, are mapped to indexes
* encompassing the entire edit.
* The alternative, mapping an interior index to the start,
* would map such an interval to an empty one.)
*
* This operation will usually but not always modify this object.
* The iterator state after this search is undefined.
*
* @param i destination index
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return source index; undefined if i is not 0..string length
* @stable ICU 60
*/
int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);
/**
* Returns whether the edit currently represented by the iterator is a change edit.
*
* @return true if this edit replaces oldLength() units with newLength() different ones.
* false if oldLength units remain unchanged.
* @stable ICU 59
*/
UBool hasChange() const { return changed; }
/**
* The length of the current span in the source string, which starts at {@link #sourceIndex}.
*
* @return the number of units in the original string which are replaced or remain unchanged.
* @stable ICU 59
*/
int32_t oldLength() const { return oldLength_; }
/**
* The length of the current span in the destination string, which starts at
* {@link #destinationIndex}, or in the replacement string, which starts at
* {@link #replacementIndex}.
*
* @return the number of units in the modified string, if hasChange() is true.
* Same as oldLength if hasChange() is false.
* @stable ICU 59
*/
int32_t newLength() const { return newLength_; }
/**
* The start index of the current span in the source string; the span has length
* {@link #oldLength}.
*
* @return the current index into the source string
* @stable ICU 59
*/
int32_t sourceIndex() const { return srcIndex; }
/**
* The start index of the current span in the replacement string; the span has length
* {@link #newLength}. Well-defined only if the current edit is a change edit.
*
* The *replacement string* is the concatenation of all substrings of the destination
* string corresponding to change edits.
*
* This method is intended to be used together with operations that write only replacement
* characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option).
* The source string can then be modified in-place.
*
* @return the current index into the replacement-characters-only string,
* not counting unchanged spans
* @stable ICU 59
*/
int32_t replacementIndex() const {
// TODO: Throw an exception if we aren't in a change edit?
return replIndex;
}
/**
* The start index of the current span in the destination string; the span has length
* {@link #newLength}.
*
* @return the current index into the full destination string
* @stable ICU 59
*/
int32_t destinationIndex() const { return destIndex; }
#ifndef U_HIDE_INTERNAL_API
/**
* A string representation of the current edit represented by the iterator for debugging. You
* should not depend on the contents of the return string.
* @internal
*/
UnicodeString& toString(UnicodeString& appendTo) const;
#endif // U_HIDE_INTERNAL_API
private:
// Edits constructs iterators through the private constructor below.
friend class Edits;
// Used by the Edits::get*Iterator() factories, which pass the Edits array and
// length as a/len. Judging by those call sites, oc selects "only changes"
// iteration and crs selects coarse iteration.
Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
int32_t readLength(int32_t head);
void updateNextIndexes();
void updatePreviousIndexes();
UBool noNext();
UBool next(UBool onlyChanges, UErrorCode &errorCode);
UBool previous(UErrorCode &errorCode);
/** @return -1: error or i<0; 0: found; 1: i>=string length */
int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
// Edit units being iterated (not owned by the iterator as far as this header shows).
const uint16_t *array;
int32_t index, length;
// 0 if we are not within compressed equal-length changes.
// Otherwise the number of remaining changes, including the current one.
int32_t remaining;
UBool onlyChanges_, coarse;
int8_t dir; // iteration direction: back(<0), initial(0), forward(>0)
// State of the current edit, exposed via hasChange(), oldLength(), newLength().
UBool changed;
int32_t oldLength_, newLength_;
// Start indexes of the current edit, exposed via sourceIndex(),
// replacementIndex() and destinationIndex().
int32_t srcIndex, replIndex, destIndex;
};
/**
* Returns an Iterator for coarse-grained change edits
* (adjacent change edits are treated as one).
* Can be used to perform simple string updates.
* Skips no-change edits.
* @return an Iterator that merges adjacent changes.
* @stable ICU 59
*/
Iterator getCoarseChangesIterator() const {
return Iterator(array, length, true, true);
}
/**
* Returns an Iterator for coarse-grained change and no-change edits
* (adjacent change edits are treated as one).
* Can be used to perform simple string updates.
* Adjacent change edits are treated as one edit.
* @return an Iterator that merges adjacent changes.
* @stable ICU 59
*/
Iterator getCoarseIterator() const {
return Iterator(array, length, false, true);
}
/**
* Returns an Iterator for fine-grained change edits
* (full granularity of change edits is retained).
* Can be used for modifying styled text.
* Skips no-change edits.
* @return an Iterator that separates adjacent changes.
* @stable ICU 59
*/
Iterator getFineChangesIterator() const {
return Iterator(array, length, true, false);
}
/**
* Returns an Iterator for fine-grained change and no-change edits
* (full granularity of change edits is retained).
* Can be used for modifying styled text.
* @return an Iterator that separates adjacent changes.
* @stable ICU 59
*/
Iterator getFineIterator() const {
return Iterator(array, length, false, false);
}
/**
* Merges the two input Edits and appends the result to this object.
*
* Consider two string transformations (for example, normalization and case mapping)
* where each records Edits in addition to writing an output string.<br>
* Edits ab reflect how substrings of input string a
* map to substrings of intermediate string b.<br>
* Edits bc reflect how substrings of intermediate string b
* map to substrings of output string c.<br>
* This function merges ab and bc such that the additional edits
* recorded in this object reflect how substrings of input string a
* map to substrings of output string c.
*
* If unrelated Edits are passed in where the output string of the first
* has a different length than the input string of the second,
* then a U_ILLEGAL_ARGUMENT_ERROR is reported.
*
* @param ab reflects how substrings of input string a
* map to substrings of intermediate string b.
* @param bc reflects how substrings of intermediate string b
* map to substrings of output string c.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return *this, with the merged edits appended
* @stable ICU 60
*/
Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);
private:
void releaseArray() noexcept;
// Array-content helpers called by the copy/move constructors after the
// scalar fields have been initialized.
Edits &copyArray(const Edits &other);
Edits &moveArray(Edits &src) noexcept;
// Overwrites the last stored unit. Indexes array[length - 1] without a
// check, so it must only be called when length > 0.
void setLastUnit(int32_t last) { array[length - 1] = static_cast<uint16_t>(last); }
// Returns the last stored unit, or 0xffff when no units are stored.
int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
void append(int32_t r);
UBool growArray();
// Number of uint16_t units in the embedded stackArray.
static const int32_t STACK_CAPACITY = 100;
// Current edit storage; the constructors start it out pointing at stackArray.
uint16_t *array;
// Capacity of array and number of units currently used.
int32_t capacity;
int32_t length;
// Returned by lengthDelta(): new text length minus old text length.
int32_t delta;
// Returned by numberOfChanges(): the number of change edits.
int32_t numChanges;
UErrorCode errorCode_;
uint16_t stackArray[STACK_CAPACITY];
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __EDITS_H__

View File

@@ -0,0 +1,69 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2012,2014 International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*/
/**
* \file
* \brief C++: internal template EnumSet<>
*/
#ifndef ENUMSET_H
#define ENUMSET_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/* Can't use #ifndef U_HIDE_INTERNAL_API for the entire EnumSet class, needed in .h file declarations */
/**
* enum bitset for boolean fields. Similar to Java EnumSet<>.
* Needs to range check. Used for private instance variables.
* @internal
* \cond
*/
template<typename T, uint32_t minValue, uint32_t limitValue>
class EnumSet {
public:
    // Creates a set with every flag cleared.
    inline EnumSet() : fBools(0) {}
    // Copies the flag bits of other.
    inline EnumSet(const EnumSet<T,minValue,limitValue>& other) : fBools(other.fBools) {}
    inline ~EnumSet() {}
#ifndef U_HIDE_INTERNAL_API
    // Clears every flag.
    inline void clear() { fBools=0; }
    // Sets the flag for toAdd.
    inline void add(T toAdd) { set(toAdd, 1); }
    // Clears the flag for toRemove.
    inline void remove(T toRemove) { set(toRemove, 0); }
    // Returns 1 if the flag for toCheck is set, otherwise 0.
    inline int32_t contains(T toCheck) const { return get(toCheck); }
    // Sets the flag for toSet if v is non-zero, clears it if v is zero.
    // Values outside the representable range map to an empty mask (see flag()),
    // so they leave the set unchanged.
    inline void set(T toSet, int32_t v) { fBools=(fBools&(~flag(toSet)))|(v?(flag(toSet)):0); }
    // Returns 1 if the flag for toCheck is set, otherwise 0.
    // Values outside the representable range always yield 0.
    inline int32_t get(T toCheck) const { return (fBools & flag(toCheck))?1:0; }
    // Returns true if minValue <= toCheck < limitValue.
    inline UBool isValidEnum(T toCheck) const { return (toCheck>=minValue&&toCheck<limitValue); }
    // Returns true if v is one of the two flag values, 0 or 1.
    inline UBool isValidValue(int32_t v) const { return (v==0||v==1); }
    // Copies the flag bits of other into this set.
    inline const EnumSet<T,minValue,limitValue>& operator=(const EnumSet<T,minValue,limitValue>& other) {
        fBools = other.fBools;
        return *this;
    }
    // Returns all flags as a bit mask; bit (value - minValue) corresponds to value.
    inline uint32_t getAll() const {
        return fBools;
    }
#endif /* U_HIDE_INTERNAL_API */
private:
    // Returns the single-bit mask for toCheck, or 0 if toCheck cannot be represented.
    //
    // The offset is computed in unsigned arithmetic so that a value below minValue
    // wraps to a large number and is rejected by the same comparison as a value at
    // or above limitValue. Offsets of 32 or more are rejected as well because
    // fBools has only 32 bits and shifting by the full width is undefined.
    // The shifted literal is unsigned so that bit 31 never shifts into a sign bit.
    inline uint32_t flag(T toCheck) const {
        const uint32_t bit = static_cast<uint32_t>(toCheck) - minValue;
        if (bit >= limitValue - minValue || bit >= 32) {
            return 0;
        }
        return static_cast<uint32_t>(1) << bit;
    }
private:
    // One bit per enum value, bit 0 corresponding to minValue.
    uint32_t fBools;
};
/** \endcond */
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif /* ENUMSET_H */

View File

@@ -0,0 +1,144 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: errorcode.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009mar10
* created by: Markus W. Scherer
*/
#ifndef __ERRORCODE_H__
#define __ERRORCODE_H__
/**
* \file
* \brief C++ API: ErrorCode class intended to make it easier to use
* ICU C and C++ APIs from C++ user code.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
/**
* Wrapper class for UErrorCode, with conversion operators for direct use
* in ICU C and C++ APIs.
* Intended to be used as a base class, where a subclass overrides
* the handleFailure() function so that it throws an exception,
* does an assert(), logs an error, etc.
* This is not an abstract base class. This class can be used and instantiated
* by itself, although it will be more useful when subclassed.
*
* Features:
* - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
* removing one common source of errors.
* - Same use in C APIs taking a UErrorCode * (pointer)
* and C++ taking UErrorCode & (reference) via conversion operators.
* - Possible automatic checking for success when it goes out of scope.
*
* Note: For automatic checking for success in the destructor, a subclass
* must implement such logic in its own destructor because the base class
* destructor cannot call a subclass function (like handleFailure()).
* The ErrorCode base class destructor does nothing.
*
* Note also: While it is possible for a destructor to throw an exception,
* it is generally unsafe to do so. This means that in a subclass the destructor
* and the handleFailure() function may need to take different actions.
*
* Sample code:
* \code
* class IcuErrorCode: public icu::ErrorCode {
* public:
* virtual ~IcuErrorCode() { // should be defined in .cpp as "key function"
* // Safe because our handleFailure() does not throw exceptions.
* if(isFailure()) { handleFailure(); }
* }
* protected:
* virtual void handleFailure() const {
* log_failure(u_errorName(errorCode));
* exit(errorCode);
* }
* };
* IcuErrorCode error_code;
* UConverter *cnv = ucnv_open("Shift-JIS", error_code);
* length = ucnv_fromUChars(dest, capacity, src, length, error_code);
* ucnv_close(cnv);
* // IcuErrorCode destructor checks for success.
* \endcode
*
* @stable ICU 4.2
*/
class U_COMMON_API ErrorCode: public UMemory {
public:
    /**
     * Default constructor: the wrapped UErrorCode starts out as U_ZERO_ERROR.
     * @stable ICU 4.2
     */
    ErrorCode() : errorCode(U_ZERO_ERROR) {}

    /**
     * Destructor. Documented to do nothing in this base class;
     * see the class documentation for how a subclass can check for success.
     * @stable ICU 4.2
     */
    virtual ~ErrorCode();

    /**
     * Conversion operator yielding a reference to the wrapped UErrorCode.
     * @stable ICU 4.2
     */
    operator UErrorCode & () {
        return errorCode;
    }

    /**
     * Conversion operator yielding a pointer to the wrapped UErrorCode.
     * @stable ICU 4.2
     */
    operator UErrorCode * () {
        return &errorCode;
    }

    /**
     * Applies U_SUCCESS() to the wrapped UErrorCode.
     * @stable ICU 4.2
     */
    UBool isSuccess() const {
        return U_SUCCESS(errorCode);
    }

    /**
     * Applies U_FAILURE() to the wrapped UErrorCode.
     * @stable ICU 4.2
     */
    UBool isFailure() const {
        return U_FAILURE(errorCode);
    }

    /**
     * Returns the wrapped UErrorCode value without modifying it.
     * @stable ICU 4.2
     */
    UErrorCode get() const {
        return errorCode;
    }

    /**
     * Replaces the wrapped UErrorCode value.
     * @stable ICU 4.2
     */
    void set(UErrorCode value) {
        errorCode = value;
    }

    /**
     * Returns the UErrorCode value and resets it to U_ZERO_ERROR.
     * @stable ICU 4.2
     */
    UErrorCode reset();

    /**
     * Asserts isSuccess().
     * That is, this method checks for a failure code, and the base class
     * handles it like this:
     * \code
     *   if(isFailure()) { handleFailure(); }
     * \endcode
     * @stable ICU 4.4
     */
    void assertSuccess() const;

    /**
     * Returns a string for the UErrorCode value.
     * The string is the same as the name of the corresponding constant
     * in the UErrorCode enum.
     * @stable ICU 4.4
     */
    const char* errorName() const;

protected:
    /**
     * The wrapped UErrorCode, directly accessible to subclasses.
     * @stable ICU 4.2
     */
    UErrorCode errorCode;

    /**
     * Hook called by assertSuccess() if isFailure() is true.
     * The base-class implementation is empty. A subclass should override it
     * to deal with a failure code: throw an exception, log an error,
     * terminate the program, or similar.
     * @stable ICU 4.2
     */
    virtual void handleFailure() const {}
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __ERRORCODE_H__

View File

@@ -0,0 +1,152 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
********************************************************************************
* Copyright (C) 1997-2015, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*/
#ifndef FILTEREDBRK_H
#define FILTEREDBRK_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/brkiter.h"
#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
U_NAMESPACE_BEGIN
/**
* \file
* \brief C++ API: FilteredBreakIteratorBuilder
*/
/**
* The FilteredBreakIteratorBuilder is used to modify the behavior of a BreakIterator
* by constructing a new BreakIterator which suppresses certain segment boundaries.
* See http://www.unicode.org/reports/tr35/tr35-general.html#Segmentation_Exceptions .
* For example, a typical English Sentence Break Iterator would break on the space
* in the string "Mr. Smith" (resulting in two segments),
* but with "Mr." as an exception, a filtered break iterator
* would consider the string "Mr. Smith" to be a single segment.
*
* @stable ICU 56
*/
class U_COMMON_API FilteredBreakIteratorBuilder : public UObject {
public:
/**
* Destructor.
* @stable ICU 56
*/
virtual ~FilteredBreakIteratorBuilder();
/**
* Construct a FilteredBreakIteratorBuilder based on rules in a locale.
* The rules are taken from CLDR exception data for the locale,
* see http://www.unicode.org/reports/tr35/tr35-general.html#Segmentation_Exceptions
* This is the equivalent of calling createEmptyInstance(UErrorCode&)
* and then repeatedly calling suppressBreakAfter(...) with the contents
* of the CLDR exception data.
* @param where the locale.
* @param status The error code.
* @return the new builder
* @stable ICU 56
*/
static FilteredBreakIteratorBuilder *createInstance(const Locale& where, UErrorCode& status);
#ifndef U_HIDE_DEPRECATED_API
/**
* This function has been deprecated in favor of createEmptyInstance(), which has
* identical behavior.
* @param status The error code.
* @return the new builder
* @deprecated ICU 60 use createEmptyInstance instead
* @see createEmptyInstance()
*/
static FilteredBreakIteratorBuilder *createInstance(UErrorCode &status);
#endif /* U_HIDE_DEPRECATED_API */
/**
* Construct an empty FilteredBreakIteratorBuilder.
* In this state, it will not suppress any segment boundaries.
* @param status The error code.
* @return the new builder
* @stable ICU 60
*/
static FilteredBreakIteratorBuilder *createEmptyInstance(UErrorCode &status);
/**
* Suppress a certain string from being the end of a segment.
* For example, after suppressing "Mr.", segments ending in "Mr." will not be returned
* by the iterator.
* @param string the string to suppress, such as "Mr."
* @param status error code
* @return returns true if the string was not present and now added,
* false if the call was a no-op because the string was already being suppressed.
* @stable ICU 56
*/
virtual UBool suppressBreakAfter(const UnicodeString& string, UErrorCode& status) = 0;
/**
* Stop suppressing a certain string from being the end of the segment.
* This function does not create any new segment boundaries, but only serves to un-do
* the effect of earlier calls to suppressBreakAfter(), or to un-do the effect of
* locale data which may be suppressing certain strings.
* @param string the exception to remove
* @param status error code
* @return returns true if the string was present and now removed,
* false if the call was a no-op because the string was not being suppressed.
* @stable ICU 56
*/
virtual UBool unsuppressBreakAfter(const UnicodeString& string, UErrorCode& status) = 0;
#ifndef U_FORCE_HIDE_DEPRECATED_API
/**
* This function has been deprecated in favor of wrapIteratorWithFilter().
* The behavior is identical.
* @param adoptBreakIterator the break iterator to adopt
* @param status error code
* @return the new BreakIterator, owned by the caller.
* @deprecated ICU 60 use wrapIteratorWithFilter() instead
* @see wrapIteratorWithFilter()
*/
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status) = 0;
#endif // U_FORCE_HIDE_DEPRECATED_API
/**
* Wrap (adopt) an existing break iterator in a new filtered instance.
* The resulting BreakIterator is owned by the caller.
* This FilteredBreakIteratorBuilder may be destroyed before the returned BreakIterator
* is destroyed.
* Note that the adoptBreakIterator is adopted by the new BreakIterator
* and should no longer be used by the caller.
* The FilteredBreakIteratorBuilder may be reused.
* This function is an alias for build().
* @param adoptBreakIterator the break iterator to adopt
* @param status error code
* @return the new BreakIterator, owned by the caller.
* @stable ICU 60
*/
inline BreakIterator *wrapIteratorWithFilter(BreakIterator* adoptBreakIterator, UErrorCode& status) {
// Forwards to the subclass implementation of build().
// NOTE(review): build() is only declared when U_FORCE_HIDE_DEPRECATED_API is not defined,
// so this wrapper presumably does not compile when that macro is set -- confirm.
return build(adoptBreakIterator, status);
}
protected:
/**
* Default constructor, for subclass use.
* @stable ICU 56
*/
FilteredBreakIteratorBuilder();
};
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // #ifndef FILTEREDBRK_H

View File

@@ -0,0 +1,43 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2009-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*/
/**
* \file
* \brief C API: access to ICU Data Version number
*/
#ifndef __ICU_DATA_VER_H__
#define __ICU_DATA_VER_H__
#include "unicode/utypes.h"
/**
* @stable ICU 49
*/
#define U_ICU_VERSION_BUNDLE "icuver"
/**
* @stable ICU 49
*/
#define U_ICU_DATA_KEY "DataVersion"
/**
* Retrieves the data version from icuver and stores it in dataVersionFillin.
*
* @param dataVersionFillin icuver data version information to be filled in if not-null
* @param status stores the error code from the calls to resource bundle
*
* @stable ICU 49
*/
U_CAPI void U_EXPORT2 u_getDataVersion(UVersionInfo dataVersionFillin, UErrorCode *status);
#endif

View File

@@ -0,0 +1,391 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2009-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* FILE NAME : icuplug.h
*
* Date Name Description
* 10/29/2009 sl New.
******************************************************************************
*/
/**
* \file
* \brief C API: ICU Plugin API
*
* <h2>C API: ICU Plugin API</h2>
*
* <p>C API allowing run-time loadable modules that extend or modify ICU functionality.</p>
*
* <h3>Loading and Configuration</h3>
*
* <p>At ICU startup time, the environment variable "ICU_PLUGINS" will be
* queried for a directory name. If it is not set, the preprocessor symbol
* "DEFAULT_ICU_PLUGINS" will be checked for a default value.</p>
*
* <p>Within the above-named directory, the file "icuplugins##.txt" will be
* opened, if present, where ## is the major+minor number of the currently
* running ICU (such as, 44 for ICU 4.4, thus icuplugins44.txt)</p>
*
* <p>The configuration file has this format:</p>
*
* <ul>
* <li>Hash (#) begins a comment line</li>
*
* <li>Non-comment lines have two or three components:
* LIBRARYNAME ENTRYPOINT [ CONFIGURATION .. ]</li>
*
* <li>Tabs or spaces separate the three items.</li>
*
* <li>LIBRARYNAME is the name of a shared library, either a short name if
* it is on the loader path, or a full pathname.</li>
*
* <li>ENTRYPOINT is the short (undecorated) symbol name of the plugin's
* entrypoint, as above.</li>
*
* <li>CONFIGURATION is the entire rest of the line . It's passed as-is to
* the plugin.</li>
* </ul>
*
* <p>An example configuration file is, in its entirety:</p>
*
* \code
* # this is icuplugins44.txt
* testplug.dll myPlugin hello=world
* \endcode
* <p>Plugins are categorized as "high" or "low" level. Low level are those
* which must be run BEFORE high level plugins, and before any operations
* which cause ICU to be 'initialized'. If a plugin is low level but
* causes ICU to allocate memory or become initialized, that plugin is said
* to cause a 'level change'. </p>
*
* <p>At load time, ICU first queries all plugins to determine their level,
* then loads all 'low' plugins first, and then loads all 'high' plugins.
* Plugins are otherwise loaded in the order listed in the configuration file.</p>
*
* <h3>Implementing a Plugin</h3>
* \code
* U_CAPI UPlugTokenReturn U_EXPORT2
* myPlugin (UPlugData *plug, UPlugReason reason, UErrorCode *status) {
* if(reason==UPLUG_REASON_QUERY) {
* uplug_setPlugName(plug, "Simple Plugin");
* uplug_setPlugLevel(plug, UPLUG_LEVEL_HIGH);
* } else if(reason==UPLUG_REASON_LOAD) {
* ... Set up some ICU things here....
* } else if(reason==UPLUG_REASON_UNLOAD) {
* ... unload, clean up ...
* }
* return UPLUG_TOKEN;
* }
* \endcode
*
* <p>The UPlugData* is an opaque pointer to the plugin-specific data, and is
* used in all other API calls.</p>
*
* <p>The API contract is:</p>
* <ol><li>The plugin MUST always return UPLUG_TOKEN as a return value- to
* indicate that it is a valid plugin.</li>
*
* <li>When the 'reason' parameter is set to UPLUG_REASON_QUERY, the
* plugin MUST call uplug_setPlugLevel() to indicate whether it is a high
* level or low level plugin.</li>
*
* <li>When the 'reason' parameter is UPLUG_REASON_QUERY, the plugin
* SHOULD call uplug_setPlugName to indicate a human readable plugin name.</li></ol>
*
*
* \internal ICU 4.4 Technology Preview
*/
#ifndef ICUPLUG_H
#define ICUPLUG_H
#include "unicode/utypes.h"
#if UCONFIG_ENABLE_PLUGINS || defined(U_IN_DOXYGEN)
/* === Basic types === */
#ifndef U_HIDE_INTERNAL_API
struct UPlugData;
/**
* @{
* Typedef for opaque structure passed to/from a plugin.
* Use the APIs to access it.
* @internal ICU 4.4 Technology Preview
*/
typedef struct UPlugData UPlugData;
/** @} */
/**
* Random Token to identify a valid ICU plugin. Plugins must return this
* from the entrypoint.
* @internal ICU 4.4 Technology Preview
*/
#define UPLUG_TOKEN 0x54762486
/**
* Max width of names, symbols, and configuration strings
* @internal ICU 4.4 Technology Preview
*/
#define UPLUG_NAME_MAX 100
/**
* Return value from a plugin entrypoint.
* Must always be set to UPLUG_TOKEN
* @see UPLUG_TOKEN
* @internal ICU 4.4 Technology Preview
*/
typedef uint32_t UPlugTokenReturn;
/**
* Reason code for the entrypoint's call
* @internal ICU 4.4 Technology Preview
*/
typedef enum {
UPLUG_REASON_QUERY = 0, /**< The plugin is being queried for info; it must call uplug_setPlugLevel() and should call uplug_setPlugName(). **/
UPLUG_REASON_LOAD = 1, /**< The plugin is being loaded. **/
UPLUG_REASON_UNLOAD = 2, /**< The plugin is being unloaded. **/
/**
* Number of known reasons (implicitly one more than UPLUG_REASON_UNLOAD).
* @internal The numeric value may change over time, see ICU ticket #12420.
*/
UPLUG_REASON_COUNT
} UPlugReason;
/**
* Level of plugin loading
* INITIAL: UNKNOWN
* QUERY: INVALID -> { LOW | HIGH }
* ERR -> INVALID
* @internal ICU 4.4 Technology Preview
*/
typedef enum {
UPLUG_LEVEL_INVALID = 0, /**< The plugin is invalid, hasn't called uplug_setPlugLevel(), or can't load. **/
UPLUG_LEVEL_UNKNOWN = 1, /**< The plugin is waiting to be installed. **/
UPLUG_LEVEL_LOW = 2, /**< The plugin must be called before u_init completes. **/
UPLUG_LEVEL_HIGH = 3, /**< The plugin can run at any time. **/
/**
* Number of known levels (implicitly one more than UPLUG_LEVEL_HIGH).
* @internal The numeric value may change over time, see ICU ticket #12420.
*/
UPLUG_LEVEL_COUNT
} UPlugLevel;
/**
* Entrypoint for an ICU plugin.
* @param plug the UPlugData handle.
* @param reason the reason code for the entrypoint's call.
* @param status Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return A valid plugin must return UPLUG_TOKEN
* @internal ICU 4.4 Technology Preview
*/
typedef UPlugTokenReturn (U_EXPORT2 UPlugEntrypoint) (
UPlugData *plug,
UPlugReason reason,
UErrorCode *status);
/* === Needed for Implementing === */
/**
* Request that this plugin not be unloaded at cleanup time.
* This is appropriate for plugins which cannot be cleaned up.
* @see u_cleanup()
* @param plug plugin
* @param dontUnload set true if this plugin can't be unloaded
* @internal ICU 4.4 Technology Preview
*/
U_CAPI void U_EXPORT2
uplug_setPlugNoUnload(UPlugData *plug, UBool dontUnload);
/**
* Set the level of this plugin.
* @param plug plugin data handle
* @param level the level of this plugin
* @internal ICU 4.4 Technology Preview
*/
U_CAPI void U_EXPORT2
uplug_setPlugLevel(UPlugData *plug, UPlugLevel level);
/**
* Get the level of this plugin.
* @param plug plugin data handle
* @return the level of this plugin
* @internal ICU 4.4 Technology Preview
*/
U_CAPI UPlugLevel U_EXPORT2
uplug_getPlugLevel(UPlugData *plug);
/**
* Get the lowest level of plug which can currently load.
* For example, if UPLUG_LEVEL_LOW is returned, then low level plugins may load;
* if UPLUG_LEVEL_HIGH is returned, then only high level plugins may load.
* @return the lowest level of plug which can currently load
* @internal ICU 4.4 Technology Preview
*/
U_CAPI UPlugLevel U_EXPORT2
uplug_getCurrentLevel(void);
/**
* Get the load status of this plugin.
* @param plug plugin data handle
* @return The error code of this plugin's load attempt.
* @internal ICU 4.4 Technology Preview
*/
U_CAPI UErrorCode U_EXPORT2
uplug_getPlugLoadStatus(UPlugData *plug);
/**
* Set the human-readable name of this plugin.
* @param plug plugin data handle
* @param name the name of this plugin. The first UPLUG_NAME_MAX characters will be copied into a new buffer.
* @internal ICU 4.4 Technology Preview
*/
U_CAPI void U_EXPORT2
uplug_setPlugName(UPlugData *plug, const char *name);
/**
* Get the human-readable name of this plugin.
* @param plug plugin data handle
* @return the name of this plugin
* @internal ICU 4.4 Technology Preview
*/
U_CAPI const char * U_EXPORT2
uplug_getPlugName(UPlugData *plug);
/**
* Return the symbol name for this plugin, if known.
* @param plug plugin data handle
* @return the symbol name, or NULL
* @internal ICU 4.4 Technology Preview
*/
U_CAPI const char * U_EXPORT2
uplug_getSymbolName(UPlugData *plug);
/**
* Return the library name for this plugin, if known.
* @param plug plugin data handle
* @param status error code
* @return the library name, or NULL
* @internal ICU 4.4 Technology Preview
*/
U_CAPI const char * U_EXPORT2
uplug_getLibraryName(UPlugData *plug, UErrorCode *status);
/**
* Return the library used for this plugin, if known.
* Plugins could use this to load data out of their own library.
* @param plug plugin data handle
* @return the library, or NULL
* @internal ICU 4.4 Technology Preview
*/
U_CAPI void * U_EXPORT2
uplug_getLibrary(UPlugData *plug);
/**
* Return the plugin-specific context data.
* @param plug plugin data handle
* @return the context, or NULL if not set
* @internal ICU 4.4 Technology Preview
*/
U_CAPI void * U_EXPORT2
uplug_getContext(UPlugData *plug);
/**
* Set the plugin-specific context data.
* @param plug plugin data handle
* @param context new context to set
* @internal ICU 4.4 Technology Preview
*/
U_CAPI void U_EXPORT2
uplug_setContext(UPlugData *plug, void *context);
/**
* Get the configuration string, if available.
* The string is in the platform default codepage.
* @param plug plugin data handle
* @return configuration string, or else null.
* @internal ICU 4.4 Technology Preview
*/
U_CAPI const char * U_EXPORT2
uplug_getConfiguration(UPlugData *plug);
/**
* Return all currently installed plugins, from newest to oldest
* Usage Example:
* \code
* UPlugData *plug = NULL;
* while((plug=uplug_nextPlug(plug)) != NULL) {
* ... do something with 'plug' ...
* }
* \endcode
* Not thread safe- do not call while plugs are added or removed.
* @param prior pass in 'NULL' to get the first (most recent) plug,
* otherwise pass the value returned on a prior call to uplug_nextPlug
* @return the next oldest plugin, or NULL if no more.
* @internal ICU 4.4 Technology Preview
*/
U_CAPI UPlugData* U_EXPORT2
uplug_nextPlug(UPlugData *prior);
/**
* Inject a plugin as if it were loaded from a library.
* This is useful for testing plugins.
* Note that it will have a 'NULL' library pointer associated
* with it, and therefore no library will be closed at cleanup time.
* Low level plugins may not be able to load, as ordering can't be enforced.
* @param entrypoint entrypoint to install
* @param config user specified configuration string, if available, or NULL.
* @param status error result
* @return the new UPlugData associated with this plugin, or NULL if error.
* @internal ICU 4.4 Technology Preview
*/
U_CAPI UPlugData* U_EXPORT2
uplug_loadPlugFromEntrypoint(UPlugEntrypoint *entrypoint, const char *config, UErrorCode *status);
/**
* Inject a plugin from a library, as if the information came from a config file.
* Low level plugins may not be able to load, and ordering can't be enforced.
* @param libName DLL name to load
* @param sym symbol of plugin (UPlugEntrypoint function)
* @param config configuration string, or NULL
* @param status error result
* @return the new UPlugData associated with this plugin, or NULL if error.
* @internal ICU 4.4 Technology Preview
*/
U_CAPI UPlugData* U_EXPORT2
uplug_loadPlugFromLibrary(const char *libName, const char *sym, const char *config, UErrorCode *status);
/**
* Remove a plugin.
* Will request the plugin to be unloaded, and close the library if needed
* @param plug plugin handle to close
* @param status error result
* @internal ICU 4.4 Technology Preview
*/
U_CAPI void U_EXPORT2
uplug_removePlug(UPlugData *plug, UErrorCode *status);
#endif /* U_HIDE_INTERNAL_API */
#endif /* UCONFIG_ENABLE_PLUGINS */
#endif /* ICUPLUG_H */

333
thirdparty/icu4c/common/unicode/idna.h vendored Normal file
View File

@@ -0,0 +1,333 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: idna.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010mar05
* created by: Markus W. Scherer
*/
#ifndef __IDNA_H__
#define __IDNA_H__
/**
* \file
* \brief C++ API: Internationalizing Domain Names in Applications (IDNA)
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#if !UCONFIG_NO_IDNA
#include "unicode/bytestream.h"
#include "unicode/stringpiece.h"
#include "unicode/uidna.h"
#include "unicode/unistr.h"
U_NAMESPACE_BEGIN
class IDNAInfo;
/**
* Abstract base class for IDNA processing.
* See http://www.unicode.org/reports/tr46/
* and http://www.ietf.org/rfc/rfc3490.txt
*
* The IDNA class is not intended for public subclassing.
*
* This C++ API currently only implements UTS #46.
* The uidna.h C API implements both UTS #46 (functions using UIDNA service object)
* and IDNA2003 (functions that do not use a service object).
* @stable ICU 4.6
*/
class U_COMMON_API IDNA : public UObject {
public:
/**
* Destructor.
* @stable ICU 4.6
*/
~IDNA();
/**
* Returns an IDNA instance which implements UTS #46.
* Returns an unmodifiable instance, owned by the caller.
* Cache it for multiple operations, and delete it when done.
* The instance is thread-safe, that is, it can be used concurrently.
*
* UTS #46 defines Unicode IDNA Compatibility Processing,
* updated to the latest version of Unicode and compatible with both
* IDNA2003 and IDNA2008.
*
* The worker functions use transitional processing, including deviation mappings,
* unless UIDNA_NONTRANSITIONAL_TO_ASCII or UIDNA_NONTRANSITIONAL_TO_UNICODE
* is used in which case the deviation characters are passed through without change.
* <b>Unicode 15.1 UTS #46 deprecated transitional processing.</b>
*
* Disallowed characters are mapped to U+FFFD.
*
* For available options see the uidna.h header.
* Operations with the UTS #46 instance do not support the
* UIDNA_ALLOW_UNASSIGNED option.
*
* By default, the UTS #46 implementation allows all ASCII characters (as valid or mapped).
* When the UIDNA_USE_STD3_RULES option is used, ASCII characters other than
* letters, digits, hyphen (LDH) and dot/full stop are disallowed and mapped to U+FFFD.
*
* @param options Bit set to modify the processing and error checking.
* These should include UIDNA_DEFAULT, or
* UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE.
* See option bit set values in uidna.h.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the UTS #46 IDNA instance, if successful
* @stable ICU 4.6
*/
static IDNA *
createUTS46Instance(uint32_t options, UErrorCode &errorCode);
/**
* Converts a single domain name label into its ASCII form for DNS lookup.
* If any processing step fails, then info.hasErrors() will be true and
* the result might not be an ASCII string.
* The label might be modified according to the types of errors.
* Labels with severe errors will be left in (or turned into) their Unicode form.
*
* The UErrorCode indicates an error only in exceptional cases,
* such as a U_MEMORY_ALLOCATION_ERROR.
*
* @param label Input domain name label
* @param dest Destination string object
* @param info Output container of IDNA processing details.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return dest
* @stable ICU 4.6
*/
virtual UnicodeString &
labelToASCII(const UnicodeString &label, UnicodeString &dest,
IDNAInfo &info, UErrorCode &errorCode) const = 0;
/**
* Converts a single domain name label into its Unicode form for human-readable display.
* If any processing step fails, then info.hasErrors() will be true.
* The label might be modified according to the types of errors.
*
* The UErrorCode indicates an error only in exceptional cases,
* such as a U_MEMORY_ALLOCATION_ERROR.
*
* @param label Input domain name label
* @param dest Destination string object
* @param info Output container of IDNA processing details.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return dest
* @stable ICU 4.6
*/
virtual UnicodeString &
labelToUnicode(const UnicodeString &label, UnicodeString &dest,
IDNAInfo &info, UErrorCode &errorCode) const = 0;
/**
* Converts a whole domain name into its ASCII form for DNS lookup.
* If any processing step fails, then info.hasErrors() will be true and
* the result might not be an ASCII string.
* The domain name might be modified according to the types of errors.
* Labels with severe errors will be left in (or turned into) their Unicode form.
*
* The UErrorCode indicates an error only in exceptional cases,
* such as a U_MEMORY_ALLOCATION_ERROR.
*
* @param name Input domain name
* @param dest Destination string object
* @param info Output container of IDNA processing details.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return dest
* @stable ICU 4.6
*/
virtual UnicodeString &
nameToASCII(const UnicodeString &name, UnicodeString &dest,
IDNAInfo &info, UErrorCode &errorCode) const = 0;
/**
* Converts a whole domain name into its Unicode form for human-readable display.
* If any processing step fails, then info.hasErrors() will be true.
* The domain name might be modified according to the types of errors.
*
* The UErrorCode indicates an error only in exceptional cases,
* such as a U_MEMORY_ALLOCATION_ERROR.
*
* @param name Input domain name
* @param dest Destination string object
* @param info Output container of IDNA processing details.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return dest
* @stable ICU 4.6
*/
virtual UnicodeString &
nameToUnicode(const UnicodeString &name, UnicodeString &dest,
IDNAInfo &info, UErrorCode &errorCode) const = 0;
// UTF-8 versions of the processing methods ---------------------------- ***
/**
* Converts a single domain name label into its ASCII form for DNS lookup.
* UTF-8 version of labelToASCII(), same behavior.
*
* The function returns nothing; the result is written to the dest byte sink.
* It is declared virtual but not pure virtual.
*
* @param label Input domain name label
* @param dest Destination byte sink; Flush()ed if successful
* @param info Output container of IDNA processing details.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.6
*/
virtual void
labelToASCII_UTF8(StringPiece label, ByteSink &dest,
IDNAInfo &info, UErrorCode &errorCode) const;
/**
* Converts a single domain name label into its Unicode form for human-readable display.
* UTF-8 version of labelToUnicode(), same behavior.
*
* The function returns nothing; the result is written to the dest byte sink.
* It is declared virtual but not pure virtual.
*
* @param label Input domain name label
* @param dest Destination byte sink; Flush()ed if successful
* @param info Output container of IDNA processing details.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.6
*/
virtual void
labelToUnicodeUTF8(StringPiece label, ByteSink &dest,
IDNAInfo &info, UErrorCode &errorCode) const;
/**
* Converts a whole domain name into its ASCII form for DNS lookup.
* UTF-8 version of nameToASCII(), same behavior.
*
* The function returns nothing; the result is written to the dest byte sink.
* It is declared virtual but not pure virtual.
*
* @param name Input domain name
* @param dest Destination byte sink; Flush()ed if successful
* @param info Output container of IDNA processing details.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.6
*/
virtual void
nameToASCII_UTF8(StringPiece name, ByteSink &dest,
IDNAInfo &info, UErrorCode &errorCode) const;
/**
* Converts a whole domain name into its Unicode form for human-readable display.
* UTF-8 version of nameToUnicode(), same behavior.
*
* The function returns nothing; the result is written to the dest byte sink.
* It is declared virtual but not pure virtual.
*
* @param name Input domain name
* @param dest Destination byte sink; Flush()ed if successful
* @param info Output container of IDNA processing details.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.6
*/
virtual void
nameToUnicodeUTF8(StringPiece name, ByteSink &dest,
IDNAInfo &info, UErrorCode &errorCode) const;
};
class UTS46;
/**
 * Output container for IDNA processing errors.
 * Holds the accumulated error bits and the "transitional processing differs" flag
 * that result from an IDNA operation.
 * The IDNAInfo class is not suitable for subclassing.
 * @stable ICU 4.6
 */
class U_COMMON_API IDNAInfo : public UMemory {
public:
    /**
     * Constructor for stack allocation.
     * Starts out in the same state that reset() produces.
     * @stable ICU 4.6
     */
    IDNAInfo()
            : errors(0),
              labelErrors(0),
              isTransDiff(false),
              isBiDi(false),
              isOkBiDi(true) {}

    /**
     * Were there IDNA processing errors?
     * @return true if there were processing errors
     * @stable ICU 4.6
     */
    UBool hasErrors() const {
        return errors != 0;
    }

    /**
     * Returns a bit set indicating IDNA processing errors.
     * See UIDNA_ERROR_... constants in uidna.h.
     * @return bit set of processing errors
     * @stable ICU 4.6
     */
    uint32_t getErrors() const {
        return errors;
    }

    /**
     * Returns true if transitional and nontransitional processing produce different results.
     * This is the case when the input label or domain name contains
     * one or more deviation characters outside a Punycode label (see UTS #46).
     * <ul>
     * <li>With nontransitional processing, such characters are
     * copied to the destination string.
     * <li>With transitional processing, such characters are
     * mapped (sharp s/sigma) or removed (joiner/nonjoiner).
     * </ul>
     * @return true if transitional and nontransitional processing produce different results
     * @stable ICU 4.6
     */
    UBool isTransitionalDifferent() const {
        return isTransDiff;
    }

private:
    // UTS46 is a friend and therefore has access to reset() and the fields below.
    friend class UTS46;

    IDNAInfo(const IDNAInfo &) = delete;  // no copying
    IDNAInfo &operator=(const IDNAInfo &) = delete;  // no copying

    // Puts every field back to the value the constructor assigns.
    void reset() {
        errors = 0;
        labelErrors = 0;
        isTransDiff = false;
        isBiDi = false;
        isOkBiDi = true;
    }

    uint32_t errors;
    uint32_t labelErrors;
    UBool isTransDiff;
    UBool isBiDi;
    UBool isOkBiDi;
};
U_NAMESPACE_END
#endif // UCONFIG_NO_IDNA
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __IDNA_H__

View File

@@ -0,0 +1,309 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef __LOCALEBUILDER_H__
#define __LOCALEBUILDER_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/locid.h"
#include "unicode/localematcher.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: Builder API for Locale
*/
U_NAMESPACE_BEGIN
class CharString;
/**
* <code>LocaleBuilder</code> is used to build instances of <code>Locale</code>
* from values configured by the setters. Unlike the <code>Locale</code>
* constructors, the <code>LocaleBuilder</code> checks if a value configured by a
* setter satisfies the syntax requirements defined by the <code>Locale</code>
* class. A <code>Locale</code> object created by a <code>LocaleBuilder</code> is
* well-formed and can be transformed to a well-formed IETF BCP 47 language tag
* without losing information.
*
* <p>The following example shows how to create a <code>Locale</code> object
* with the <code>LocaleBuilder</code>.
* <blockquote>
* <pre>
* UErrorCode status = U_ZERO_ERROR;
* Locale aLocale = LocaleBuilder()
* .setLanguage("sr")
* .setScript("Latn")
* .setRegion("RS")
* .build(status);
* if (U_SUCCESS(status)) {
* // ...
* }
* </pre>
* </blockquote>
*
* <p>LocaleBuilders can be reused; <code>clear()</code> resets all
* fields to their default values.
*
* <p>LocaleBuilder tracks errors in an internal UErrorCode. For all setters,
* except setLanguageTag and setLocale, LocaleBuilder will return immediately
* if the internal UErrorCode is in error state.
* To reset the internal state and error code, call the clear() method.
* The setLanguageTag and setLocale methods will first clear the internal
* UErrorCode, then track the error of the validation of the input parameter
* into the internal UErrorCode.
*
* @stable ICU 64
*/
class U_COMMON_API LocaleBuilder : public UObject {
public:
/**
* Constructs an empty LocaleBuilder. The default value of all
* fields, extensions, and private use information is the
* empty string.
*
* @stable ICU 64
*/
LocaleBuilder();
/**
* Destructor
* @stable ICU 64
*/
virtual ~LocaleBuilder();
/**
* Resets the <code>LocaleBuilder</code> to match the provided
* <code>locale</code>. Existing state is discarded.
*
* <p>All fields of the locale must be well-formed.
* <p>This method clears the internal UErrorCode.
*
* @param locale the locale
* @return This builder.
*
* @stable ICU 64
*/
LocaleBuilder& setLocale(const Locale& locale);
/**
* Resets the LocaleBuilder to match the provided IETF BCP 47 language tag.
* Discards the existing state.
* The empty string causes the builder to be reset, like {@link #clear}.
* Legacy language tags (marked as “Type: grandfathered” in BCP 47)
* are converted to their canonical form before being processed.
* Otherwise, the <code>language tag</code> must be well-formed,
* or else the build() method will later report a U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>This method clears the internal UErrorCode.
*
* @param tag the language tag, defined as IETF BCP 47 language tag.
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& setLanguageTag(StringPiece tag);
/**
* Sets the language. If <code>language</code> is the empty string, the
* language in this <code>LocaleBuilder</code> is removed. Otherwise, the
* <code>language</code> must be well-formed, or else the build() method will
* later report a U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The syntax of language value is defined as
* [unicode_language_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag).
*
* @param language the language
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& setLanguage(StringPiece language);
/**
* Sets the script. If <code>script</code> is the empty string, the script in
* this <code>LocaleBuilder</code> is removed.
* Otherwise, the <code>script</code> must be well-formed, or else the build()
* method will later report a U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The script value is a four-letter script code as
* [unicode_script_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_script_subtag)
* defined by ISO 15924.
*
* @param script the script
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& setScript(StringPiece script);
/**
* Sets the region. If region is the empty string, the region in this
* <code>LocaleBuilder</code> is removed. Otherwise, the <code>region</code>
* must be well-formed, or else the build() method will later report a
* U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The region value is defined by
* [unicode_region_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_region_subtag)
* as a two-letter ISO 3166 code or a three-digit UN M.49 area code.
*
* <p>The region value in the <code>Locale</code> created by the
* <code>LocaleBuilder</code> is always normalized to upper case.
*
* @param region the region
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& setRegion(StringPiece region);
/**
* Sets the variant. If variant is the empty string, the variant in this
* <code>LocaleBuilder</code> is removed. Otherwise, the <code>variant</code>
* must be well-formed, or else the build() method will later report a
* U_ILLEGAL_ARGUMENT_ERROR.
*
* <p><b>Note:</b> This method checks if <code>variant</code>
* satisfies the
* [unicode_variant_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_variant_subtag)
* syntax requirements, and normalizes the value to lowercase letters. However,
* the <code>Locale</code> class does not impose any syntactic
* restriction on variant. To set an ill-formed variant, use a Locale constructor.
* If there are multiple unicode_variant_subtag, the caller must concatenate
* them with '-' as separator (ex: "foobar-fibar").
*
* @param variant the variant
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& setVariant(StringPiece variant);
/**
* Sets the extension for the given key. If the value is the empty string,
* the extension is removed. Otherwise, the <code>key</code> and
* <code>value</code> must be well-formed, or else the build() method will
* later report a U_ILLEGAL_ARGUMENT_ERROR.
*
* <p><b>Note:</b> The key ('u') is used for the Unicode locale extension.
* Setting a value for this key replaces any existing Unicode locale key/type
* pairs with those defined in the extension.
*
* <p><b>Note:</b> The key ('x') is used for the private use code. To be
* well-formed, the value for this key needs only to have subtags of one to
* eight alphanumeric characters, not two to eight as in the general case.
*
* @param key the extension key
* @param value the extension value
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& setExtension(char key, StringPiece value);
/**
* Sets the Unicode locale keyword type for the given key. If the type
* StringPiece is constructed with a nullptr, the keyword is removed.
* If the type is the empty string, the keyword is set without type subtags.
* Otherwise, the key and type must be well-formed, or else the build()
* method will later report a U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>Keys and types are converted to lower case.
*
* <p><b>Note</b>: Setting the 'u' extension via {@link #setExtension}
* replaces all Unicode locale keywords with those defined in the
* extension.
*
* @param key the Unicode locale key
* @param type the Unicode locale type
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& setUnicodeLocaleKeyword(
StringPiece key, StringPiece type);
/**
* Adds a unicode locale attribute, if not already present, otherwise
* has no effect. The attribute must not be the empty string and must be
* well-formed, or else U_ILLEGAL_ARGUMENT_ERROR will be set to status
* during the build() call.
*
* @param attribute the attribute
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& addUnicodeLocaleAttribute(StringPiece attribute);
/**
* Removes a unicode locale attribute, if present, otherwise has no
* effect. The attribute must not be the empty string and must be well-formed,
* or else U_ILLEGAL_ARGUMENT_ERROR will be set to status during the build() call.
*
* <p>Attribute comparison for removal is case-insensitive.
*
* @param attribute the attribute
* @return This builder.
* @stable ICU 64
*/
LocaleBuilder& removeUnicodeLocaleAttribute(StringPiece attribute);
/**
* Resets the builder to its initial, empty state.
* <p>This method clears the internal UErrorCode.
*
* @return this builder
* @stable ICU 64
*/
LocaleBuilder& clear();
/**
* Resets the extensions to their initial, empty state.
* Language, script, region and variant are unchanged.
*
* @return this builder
* @stable ICU 64
*/
LocaleBuilder& clearExtensions();
/**
* Returns an instance of <code>Locale</code> created from the fields set
* on this builder.
* If any of the setters or the build() call itself requires a memory
* allocation that fails, status will be set to U_MEMORY_ALLOCATION_ERROR.
* If any of the fields set by the setters are not well-formed, the status
* will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
* not change after the build() call and the caller is free to keep using
* the same builder to build more locales.
*
* @param status ICU error code; receives the errors described above.
* @return a new Locale
* @stable ICU 64
*/
Locale build(UErrorCode& status);
/**
* Sets the UErrorCode if an error occurred while recording sets.
* Preserves older error codes in the outErrorCode.
* @param outErrorCode Set to an error code that occurred while setting subtags.
* Unchanged if there is no such error or if outErrorCode
* already contained an error.
* @return true if U_FAILURE(outErrorCode)
* @stable ICU 65
*/
UBool copyErrorTo(UErrorCode &outErrorCode) const;
private:
// LocaleMatcher::Result is granted access to the private members below.
friend class LocaleMatcher::Result;
// Private helper; judging by its name it copies the extensions of src into this builder.
// NOTE(review): implementation is not part of this header — confirm in localebuilder.cpp.
void copyExtensionsFrom(const Locale& src, UErrorCode& errorCode);
// The internal UErrorCode described in the class documentation.
UErrorCode status_;
// Fixed-size character buffers for the language, script and region subtags.
// NOTE(review): the sizes are presumably the maximum subtag length plus a terminating NUL — confirm.
char language_[9];
char script_[5];
char region_[4];
CharString *variant_; // Pointer not object so we need not #include internal charstr.h.
icu::Locale *extensions_; // Pointer not object. Storage for all other fields.
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __LOCALEBUILDER_H__

View File

@@ -0,0 +1,710 @@
// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// localematcher.h
// created: 2019may08 Markus W. Scherer
#ifndef __LOCALEMATCHER_H__
#define __LOCALEMATCHER_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include <optional>
#include "unicode/locid.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: Locale matcher: User's desired locales vs. application's supported locales.
*/
/**
* Builder option for whether the language subtag or the script subtag is most important.
*
* @see LocaleMatcher::Builder#setFavorSubtag(ULocMatchFavorSubtag)
* @stable ICU 65
*/
enum ULocMatchFavorSubtag {
/**
* Language differences are most important, then script differences, then region differences.
* (This is the default behavior.)
*
* @stable ICU 65
*/
ULOCMATCH_FAVOR_LANGUAGE,
/**
* Makes script differences matter relatively more than language differences.
*
* @stable ICU 65
*/
ULOCMATCH_FAVOR_SCRIPT
};
// The typedef is compiled only when U_IN_DOXYGEN is not defined.
#ifndef U_IN_DOXYGEN
typedef enum ULocMatchFavorSubtag ULocMatchFavorSubtag;
#endif
/**
* Builder option for whether all desired locales are treated equally or
* earlier ones are preferred.
*
* @see LocaleMatcher::Builder#setDemotionPerDesiredLocale(ULocMatchDemotion)
* @stable ICU 65
*/
enum ULocMatchDemotion {
/**
* All desired locales are treated equally.
*
* @stable ICU 65
*/
ULOCMATCH_DEMOTION_NONE,
/**
* Earlier desired locales are preferred.
* (This is the value a LocaleMatcher::Builder starts out with.)
*
* <p>From each desired locale to the next,
* the distance to any supported locale is increased by an additional amount
* which is at least as large as most region mismatches.
* A later desired locale has to have a better match with some supported locale
* due to more than merely having the same region subtag.
*
* <p>For example: <code>Supported={en, sv} desired=[en-GB, sv]</code>
* yields <code>Result(en-GB, en)</code> because
* with the demotion of sv its perfect match is no better than
* the region distance between the earlier desired locale en-GB and en=en-US.
*
* <p>Notes:
* <ul>
* <li>In some cases, language and/or script differences can be as small as
* the typical region difference. (Example: sr-Latn vs. sr-Cyrl)
* <li>It is possible for certain region differences to be larger than usual,
* and larger than the demotion.
* (As of CLDR 35 there is no such case, but
* this is possible in future versions of the data.)
* </ul>
*
* @stable ICU 65
*/
ULOCMATCH_DEMOTION_REGION
};
// The typedef is compiled only when U_IN_DOXYGEN is not defined.
#ifndef U_IN_DOXYGEN
typedef enum ULocMatchDemotion ULocMatchDemotion;
#endif
/**
* Builder option for whether to include or ignore one-way (fallback) match data.
* The LocaleMatcher uses CLDR languageMatch data which includes fallback (oneway=true) entries.
* Sometimes it is desirable to ignore those.
*
* <p>For example, consider a web application with the UI in a given language,
* with a link to another, related web app.
* The link should include the UI language, and the target server may also use
* the client's Accept-Language header data.
* The target server has its own list of supported languages.
* One may want to favor UI language consistency, that is,
* if there is a decent match for the original UI language, we want to use it,
* but not if it is merely a fallback.
*
* @see LocaleMatcher::Builder#setDirection(ULocMatchDirection)
* @stable ICU 67
*/
enum ULocMatchDirection {
/**
* Locale matching includes one-way matches such as Breton→French. (default)
*
* @stable ICU 67
*/
ULOCMATCH_DIRECTION_WITH_ONE_WAY,
/**
* Locale matching limited to two-way matches including e.g. Danish↔Norwegian
* but ignoring one-way matches.
*
* @stable ICU 67
*/
ULOCMATCH_DIRECTION_ONLY_TWO_WAY
};
// The typedef is compiled only when U_IN_DOXYGEN is not defined.
#ifndef U_IN_DOXYGEN
typedef enum ULocMatchDirection ULocMatchDirection;
#endif
struct UHashtable;
U_NAMESPACE_BEGIN
struct LSR;
class LikelySubtags;
class LocaleDistance;
class LocaleLsrIterator;
class UVector;
/**
* Immutable class that picks the best match between a user's desired locales and
* an application's supported locales.
* Movable but not copyable.
*
* <p>Example:
* <pre>
* UErrorCode errorCode = U_ZERO_ERROR;
* LocaleMatcher matcher = LocaleMatcher::Builder().setSupportedLocalesFromListString("fr, en-GB, en").build(errorCode);
* const Locale *bestSupported = matcher.getBestMatch(Locale::getUS(), errorCode); // "en"
* </pre>
*
* <p>A matcher takes into account when languages are close to one another,
* such as Danish and Norwegian,
* and when regional variants are close, like en-GB and en-AU as opposed to en-US.
*
* <p>If there are multiple supported locales with the same (language, script, region)
* likely subtags, then the current implementation returns the first of those locales.
* It ignores variant subtags (except for pseudolocale variants) and extensions.
* This may change in future versions.
*
* <p>For example, the current implementation does not distinguish between
* de, de-DE, de-Latn, de-1901, de-u-co-phonebk.
*
* <p>If you prefer one equivalent locale over another, then provide only the preferred one,
* or place it earlier in the list of supported locales.
*
* <p>Otherwise, the order of supported locales may have no effect on the best-match results.
* The current implementation compares each desired locale with supported locales
* in the following order:
* 1. Default locale, if supported;
* 2. CLDR "paradigm locales" like en-GB and es-419;
* 3. other supported locales.
* This may change in future versions.
*
* <p>Often a product will just need one matcher instance, built with the languages
* that it supports. However, it may want multiple instances with different
* default languages based on additional information, such as the domain.
*
* <p>This class is not intended for public subclassing.
*
* @stable ICU 65
*/
class U_COMMON_API LocaleMatcher : public UMemory {
public:
/**
* Data for the best-matching pair of a desired and a supported locale.
* Movable but not copyable.
*
* @stable ICU 65
*/
class U_COMMON_API Result : public UMemory {
public:
/**
* Move constructor; might modify the source.
* This object will have the same contents that the source object had.
*
* @param src Result to move contents from.
* @stable ICU 65
*/
Result(Result &&src) noexcept;
/**
* Destructor.
*
* @stable ICU 65
*/
~Result();
/**
* Move assignment; might modify the source.
* This object will have the same contents that the source object had.
*
* @param src Result to move contents from.
* @stable ICU 65
*/
Result &operator=(Result &&src) noexcept;
/**
* Returns the best-matching desired locale.
* nullptr if the list of desired locales is empty or if none matched well enough.
*
* @return the best-matching desired locale, or nullptr.
* @stable ICU 65
*/
inline const Locale *getDesiredLocale() const { return desiredLocale; }
/**
* Returns the best-matching supported locale.
* If none matched well enough, this is the default locale.
* The default locale is nullptr if Builder::setNoDefaultLocale() was called,
* or if the list of supported locales is empty and no explicit default locale is set.
*
* @return the best-matching supported locale, or nullptr.
* @stable ICU 65
*/
inline const Locale *getSupportedLocale() const { return supportedLocale; }
/**
* Returns the index of the best-matching desired locale in the input Iterable order.
* -1 if the list of desired locales is empty or if none matched well enough.
*
* @return the index of the best-matching desired locale, or -1.
* @stable ICU 65
*/
inline int32_t getDesiredIndex() const { return desiredIndex; }
/**
* Returns the index of the best-matching supported locale in the
* constructor's or builder's input order (“set” Collection plus “added” locales).
* If the matcher was built from a locale list string, then the iteration order is that
* of a LocalePriorityList built from the same string.
* -1 if the list of supported locales is empty or if none matched well enough.
*
* @return the index of the best-matching supported locale, or -1.
* @stable ICU 65
*/
inline int32_t getSupportedIndex() const { return supportedIndex; }
/**
* Takes the best-matching supported locale and adds relevant fields of the
* best-matching desired locale, such as the -t- and -u- extensions.
* May replace some fields of the supported locale.
* The result is the locale that should be used for date and number formatting, collation, etc.
* Returns the root locale if getSupportedLocale() returns nullptr.
*
* <p>Example: desired=ar-SA-u-nu-latn, supported=ar-EG, resolved locale=ar-SA-u-nu-latn
*
* @param errorCode Standard ICU error code (in/out parameter).
* @return a locale combining the best-matching desired and supported locales.
* @stable ICU 65
*/
Locale makeResolvedLocale(UErrorCode &errorCode) const;
private:
// Private constructor that stores its five arguments unchanged in the fields below.
// LocaleMatcher is a friend (see the end of this class) and can therefore call it.
Result(const Locale *desired, const Locale *supported,
int32_t desIndex, int32_t suppIndex, UBool owned) :
desiredLocale(desired), supportedLocale(supported),
desiredIndex(desIndex), supportedIndex(suppIndex),
desiredIsOwned(owned) {}
// Copying is disabled; only the move operations declared above are available.
Result(const Result &other) = delete;
Result &operator=(const Result &other) = delete;
// Values returned by getDesiredLocale() / getSupportedLocale(); either may be nullptr.
const Locale *desiredLocale;
const Locale *supportedLocale;
// Values returned by getDesiredIndex() / getSupportedIndex(); -1 means "none".
int32_t desiredIndex;
int32_t supportedIndex;
// NOTE(review): presumably true when this Result owns desiredLocale and must delete it —
// confirm against the destructor in localematcher.cpp.
UBool desiredIsOwned;
friend class LocaleMatcher;
};
/**
* LocaleMatcher builder.
* Movable but not copyable.
*
* @stable ICU 65
*/
class U_COMMON_API Builder : public UMemory {
public:
/**
* Constructs a builder used in chaining parameters for building a LocaleMatcher.
* All options start out with the values of the in-class member initializers
* in the private section of this class.
*
* @stable ICU 65
*/
Builder() {}
/**
* Move constructor; might modify the source.
* This builder will have the same contents that the source builder had.
*
* @param src Builder to move contents from.
* @stable ICU 65
*/
Builder(Builder &&src) noexcept;
/**
* Destructor.
*
* @stable ICU 65
*/
~Builder();
/**
* Move assignment; might modify the source.
* This builder will have the same contents that the source builder had.
*
* @param src Builder to move contents from.
* @stable ICU 65
*/
Builder &operator=(Builder &&src) noexcept;
/**
* Parses an Accept-Language string
* (<a href="https://tools.ietf.org/html/rfc2616#section-14.4">RFC 2616 Section 14.4</a>),
* such as "af, en, fr;q=0.9", and sets the supported locales accordingly.
* Allows whitespace in more places but does not allow "*".
* Clears any previously set/added supported locales first.
*
* @param locales the Accept-Language string of locales to set
* @return this Builder object
* @stable ICU 65
*/
Builder &setSupportedLocalesFromListString(StringPiece locales);
/**
* Copies the supported locales, preserving iteration order.
* Clears any previously set/added supported locales first.
* Duplicates are allowed, and are not removed.
*
* @param locales the list of locales
* @return this Builder object
* @stable ICU 65
*/
Builder &setSupportedLocales(Locale::Iterator &locales);
/**
* Copies the supported locales from the begin/end range, preserving iteration order.
* Clears any previously set/added supported locales first.
* Duplicates are allowed, and are not removed.
* Does nothing if this builder already recorded an error.
*
* Each of the iterator parameter values must be an
* input iterator whose value is convertible to const Locale &.
*
* @param begin Start of range.
* @param end Exclusive end of range.
* @return this Builder object
* @stable ICU 65
*/
template<typename Iter>
Builder &setSupportedLocales(Iter begin, Iter end) {
// An earlier failure is sticky: leave the builder untouched.
if (U_FAILURE(errorCode_)) { return *this; }
clearSupportedLocales();
while (begin != end) {
addSupportedLocale(*begin++);
}
return *this;
}
/**
* Copies the supported locales from the begin/end range, preserving iteration order.
* Calls the converter to convert each *begin to a Locale or const Locale &.
* Clears any previously set/added supported locales first.
* Duplicates are allowed, and are not removed.
* Does nothing if this builder already recorded an error.
*
* Each of the iterator parameter values must be an
* input iterator whose value can be passed to the converter.
*
* @param begin Start of range.
* @param end Exclusive end of range.
* @param converter Converter from *begin to const Locale & or compatible.
* @return this Builder object
* @stable ICU 65
*/
template<typename Iter, typename Conv>
Builder &setSupportedLocalesViaConverter(Iter begin, Iter end, Conv converter) {
// An earlier failure is sticky: leave the builder untouched.
if (U_FAILURE(errorCode_)) { return *this; }
clearSupportedLocales();
while (begin != end) {
addSupportedLocale(converter(*begin++));
}
return *this;
}
/**
* Adds another supported locale.
* Duplicates are allowed, and are not removed.
*
* @param locale another locale
* @return this Builder object
* @stable ICU 65
*/
Builder &addSupportedLocale(const Locale &locale);
/**
* Sets no default locale.
* There will be no explicit or implicit default locale.
* If there is no good match, then the matcher will return nullptr for the
* best supported locale.
*
* @return this Builder object
* @stable ICU 68
*/
Builder &setNoDefaultLocale();
/**
* Sets the default locale; if nullptr, or if it is not set explicitly,
* then the first supported locale is used as the default locale.
* There is no default locale at all (nullptr will be returned instead)
* if setNoDefaultLocale() is called.
*
* @param defaultLocale the default locale (will be copied)
* @return this Builder object
* @stable ICU 65
*/
Builder &setDefaultLocale(const Locale *defaultLocale);
/**
* If ULOCMATCH_FAVOR_SCRIPT, then the language differences are smaller than script
* differences.
* This is used in situations (such as maps) where
* it is better to fall back to the same script than a similar language.
* A new Builder starts out with ULOCMATCH_FAVOR_LANGUAGE.
*
* @param subtag the subtag to favor
* @return this Builder object
* @stable ICU 65
*/
Builder &setFavorSubtag(ULocMatchFavorSubtag subtag);
/**
* Option for whether all desired locales are treated equally or
* earlier ones are preferred.
* A new Builder starts out with ULOCMATCH_DEMOTION_REGION (earlier ones are preferred).
*
* @param demotion the demotion per desired locale to set.
* @return this Builder object
* @stable ICU 65
*/
Builder &setDemotionPerDesiredLocale(ULocMatchDemotion demotion);
/**
* Option for whether to include or ignore one-way (fallback) match data.
* By default, they are included.
* The new value is ignored if this builder already recorded an error.
*
* @param matchDirection the match direction to set.
* @return this Builder object
* @stable ICU 67
*/
Builder &setDirection(ULocMatchDirection matchDirection) {
if (U_SUCCESS(errorCode_)) {
direction_ = matchDirection;
}
return *this;
}
/**
* Sets the maximum distance for an acceptable match.
* The matcher will return a match for a pair of locales only if
* they match at least as well as the pair given here.
*
* For example, setMaxDistance(en-US, en-GB) limits matches to ones where the
* (desired, support) locales have a distance no greater than a region subtag difference.
* This is much stricter than the CLDR default.
*
* The details of locale matching are subject to changes in
* CLDR data and in the algorithm.
* Specifying a maximum distance in relative terms via a sample pair of locales
* insulates from changes that affect all distance metrics similarly,
* but some changes will necessarily affect relative distances between
* different pairs of locales.
*
* @param desired the desired locale for distance comparison.
* @param supported the supported locale for distance comparison.
* @return this Builder object
* @stable ICU 68
*/
Builder &setMaxDistance(const Locale &desired, const Locale &supported);
/**
* Sets the UErrorCode if an error occurred while setting parameters.
* Preserves older error codes in the outErrorCode.
*
* @param outErrorCode Set to an error code if it does not contain one already
* and an error occurred while setting parameters.
* Otherwise unchanged.
* @return true if U_FAILURE(outErrorCode)
* @stable ICU 65
*/
UBool copyErrorTo(UErrorCode &outErrorCode) const;
/**
* Builds and returns a new locale matcher.
* This builder can continue to be used.
*
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return LocaleMatcher
* @stable ICU 65
*/
LocaleMatcher build(UErrorCode &errorCode) const;
private:
friend class LocaleMatcher;
// Copying is disabled; only the move operations declared above are available.
Builder(const Builder &other) = delete;
Builder &operator=(const Builder &other) = delete;
// Private helpers implemented outside this header; clearSupportedLocales() is called by the
// inline setSupportedLocales() templates before they add the new locales.
void clearSupportedLocales();
bool ensureSupportedLocaleVector();
// Every field has an in-class initializer, which is why the Builder() constructor body is empty.
// First error recorded by a setter; checked by the inline setters above.
UErrorCode errorCode_ = U_ZERO_ERROR;
UVector *supportedLocales_ = nullptr;
// NOTE(review): -1 presumably means "no explicit threshold set" — confirm in localematcher.cpp.
int32_t thresholdDistance_ = -1;
ULocMatchDemotion demotion_ = ULOCMATCH_DEMOTION_REGION;
Locale *defaultLocale_ = nullptr;
// NOTE(review): presumably cleared by setNoDefaultLocale() — confirm in localematcher.cpp.
bool withDefault_ = true;
ULocMatchFavorSubtag favor_ = ULOCMATCH_FAVOR_LANGUAGE;
ULocMatchDirection direction_ = ULOCMATCH_DIRECTION_WITH_ONE_WAY;
// NOTE(review): presumably the locale pair given to setMaxDistance() — confirm.
Locale *maxDistanceDesired_ = nullptr;
Locale *maxDistanceSupported_ = nullptr;
};
// FYI No public LocaleMatcher constructors in C++; use the Builder.
/**
* Move constructor; might modify the source.
* This matcher will have the same settings that the source matcher had.
* @param src source matcher
* @stable ICU 65
*/
LocaleMatcher(LocaleMatcher &&src) noexcept;
/**
* Destructor.
* Declared without the virtual keyword.
* @stable ICU 65
*/
~LocaleMatcher();
/**
* Move assignment operator; might modify the source.
* This matcher will have the same settings that the source matcher had.
* The behavior is undefined if *this and src are the same object,
* so callers must not self-move-assign a matcher.
* @param src source matcher
* @return *this
* @stable ICU 65
*/
LocaleMatcher &operator=(LocaleMatcher &&src) noexcept;
/**
* Returns the supported locale which best matches the desired locale.
*
* @param desiredLocale Typically a user's language.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return the best-matching supported locale.
* As documented for Builder::setNoDefaultLocale(), this is nullptr if there is
* no good match and no default locale.
* @stable ICU 65
*/
const Locale *getBestMatch(const Locale &desiredLocale, UErrorCode &errorCode) const;
/**
* Returns the supported locale which best matches one of the desired locales.
*
* @param desiredLocales Typically a user's languages, in order of preference (descending).
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return the best-matching supported locale.
* @stable ICU 65
*/
const Locale *getBestMatch(Locale::Iterator &desiredLocales, UErrorCode &errorCode) const;
/**
* Parses an Accept-Language string
* (<a href="https://tools.ietf.org/html/rfc2616#section-14.4">RFC 2616 Section 14.4</a>),
* such as "af, en, fr;q=0.9",
* and returns the supported locale which best matches one of the desired locales.
* Allows whitespace in more places but does not allow "*".
*
* @param desiredLocaleList Typically a user's languages, as an Accept-Language string.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return the best-matching supported locale.
* @stable ICU 65
*/
const Locale *getBestMatchForListString(StringPiece desiredLocaleList, UErrorCode &errorCode) const;
/**
* Returns the best match between the desired locale and the supported locales.
* If the result's desired locale is not nullptr, then it is the address of the input locale.
* It has not been cloned.
*
* @param desiredLocale Typically a user's language.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return the best-matching pair of the desired and a supported locale.
* @stable ICU 65
*/
Result getBestMatchResult(const Locale &desiredLocale, UErrorCode &errorCode) const;
/**
* Returns the best match between the desired and supported locales.
* If the result's desired locale is not nullptr, then it is a clone of
* the best-matching desired locale. The Result object owns the clone.
*
* @param desiredLocales Typically a user's languages, in order of preference (descending).
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return the best-matching pair of a desired and a supported locale.
* @stable ICU 65
*/
Result getBestMatchResult(Locale::Iterator &desiredLocales, UErrorCode &errorCode) const;
/**
* Returns true if the pair of locales matches acceptably.
* This is influenced by Builder options such as setDirection(), setFavorSubtag(),
* and setMaxDistance().
*
* @param desired The desired locale.
* @param supported The supported locale.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return true if the pair of locales matches acceptably.
* @stable ICU 68
*/
UBool isMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const;
#ifndef U_HIDE_INTERNAL_API
/**
* Returns a fraction between 0 and 1, where 1 means that the languages are a
* perfect match, and 0 means that they are completely different.
*
* <p>This is mostly an implementation detail, and the precise values may change over time.
* The implementation may use either the maximized forms or the other ones, or both.
* The implementation may or may not rely on the forms to be consistent with each other.
*
* <p>Callers should construct and use a matcher rather than match pairs of locales directly.
*
* @param desired Desired locale.
* @param supported Supported locale.
* @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
* or else the function returns immediately. Check for U_FAILURE()
* on output or use with function chaining. (See User Guide for details.)
* @return value between 0 and 1, inclusive.
* @internal (has a known user)
*/
double internalMatch(const Locale &desired, const Locale &supported, UErrorCode &errorCode) const;
#endif // U_HIDE_INTERNAL_API
private:
LocaleMatcher(const Builder &builder, UErrorCode &errorCode);
LocaleMatcher(const LocaleMatcher &other) = delete;
LocaleMatcher &operator=(const LocaleMatcher &other) = delete;
int32_t putIfAbsent(const LSR &lsr, int32_t i, int32_t suppLength, UErrorCode &errorCode);
std::optional<int32_t> getBestSuppIndex(LSR desiredLSR, LocaleLsrIterator *remainingIter, UErrorCode &errorCode) const;
const LikelySubtags &likelySubtags;
const LocaleDistance &localeDistance;
int32_t thresholdDistance;
int32_t demotionPerDesiredLocale;
ULocMatchFavorSubtag favorSubtag;
ULocMatchDirection direction;
// These are in input order.
const Locale ** supportedLocales;
LSR *lsrs;
int32_t supportedLocalesLength;
// These are in preference order: 1. Default locale 2. paradigm locales 3. others.
UHashtable *supportedLsrToIndex; // Map<LSR, Integer>
// Array versions of the supportedLsrToIndex keys and values.
// The distance lookup loops over the supportedLSRs and returns the index of the best match.
const LSR **supportedLSRs;
int32_t *supportedIndexes;
int32_t supportedLSRsLength;
Locale *ownedDefaultLocale;
const Locale *defaultLocale;
};
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
#endif // __LOCALEMATCHER_H__

View File

@@ -0,0 +1,609 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: localpointer.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov13
* created by: Markus W. Scherer
*/
#ifndef __LOCALPOINTER_H__
#define __LOCALPOINTER_H__
/**
* \file
* \brief C++ API: "Smart pointers" for use with and in ICU4C C++ code.
*
* These classes are inspired by
* - std::auto_ptr
* - boost::scoped_ptr & boost::scoped_array
* - Taligent Safe Pointers (TOnlyPointerTo)
*
* but none of those provide for all of the goals for ICU smart pointers:
* - Smart pointer owns the object and releases it when it goes out of scope.
* - No transfer of ownership via copy/assignment to reduce misuse. Simpler & more robust.
* - ICU-compatible: No exceptions.
* - Need to be able to orphan/release the pointer and its ownership.
* - Need variants for normal C++ object pointers, C++ arrays, and ICU C service objects.
*
* For details see https://icu.unicode.org/design/cpp/scoped_ptr
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include <memory>
U_NAMESPACE_BEGIN
/**
* "Smart pointer" base class; do not use directly: use LocalPointer etc.
*
* Base class for smart pointer classes that do not throw exceptions.
*
* Do not use this base class directly, since it does not delete its pointer.
* A subclass must implement methods that delete the pointer:
* Destructor and adoptInstead().
*
* There is no operator T *() provided because the programmer must decide
* whether to use getAlias() (without transfer of ownership) or orphan()
* (with transfer of ownership and NULLing of the pointer).
*
* @see LocalPointer
* @see LocalArray
* @see U_DEFINE_LOCAL_OPEN_POINTER
* @stable ICU 4.4
*/
template<typename T>
class LocalPointerBase {
public:
    // Heap allocation is disabled: instances are meant to live on the stack only.
    static void* U_EXPORT2 operator new(size_t) = delete;
    static void* U_EXPORT2 operator new[](size_t) = delete;
#if U_HAVE_PLACEMENT_NEW
    static void* U_EXPORT2 operator new(size_t, void*) = delete;
#endif

    /**
     * Constructor; adopts the given pointer.
     * @param p simple pointer to an object that is adopted
     * @stable ICU 4.4
     */
    explicit LocalPointerBase(T *p=nullptr) : ptr(p) {}

    /**
     * Destructor. The base class does not release the pointer;
     * a subclass destructor must delete the owned object.
     * @stable ICU 4.4
     */
    ~LocalPointerBase() { /* delete ptr; */ }

    /**
     * Tests whether no object is currently held.
     * @return true if the pointer ==nullptr
     * @stable ICU 4.4
     */
    UBool isNull() const { return nullptr==ptr; }

    /**
     * Tests whether an object is currently held.
     * @return true if the pointer !=nullptr
     * @stable ICU 4.4
     */
    UBool isValid() const { return nullptr!=ptr; }

    /**
     * Equality comparison with a simple pointer; lets existing code
     * that compares with ==nullptr keep working unchanged.
     * @param other simple pointer for comparison
     * @return true if this pointer value equals other
     * @stable ICU 4.4
     */
    bool operator==(const T *other) const { return ptr==other; }

    /**
     * Inequality comparison with a simple pointer; lets existing code
     * that compares with !=nullptr keep working unchanged.
     * @param other simple pointer for comparison
     * @return true if this pointer value differs from other
     * @stable ICU 4.4
     */
    bool operator!=(const T *other) const { return !(ptr==other); }

    /**
     * Returns the pointer; ownership stays with this object.
     * @return the pointer value
     * @stable ICU 4.4
     */
    T *getAlias() const { return ptr; }

    /**
     * Dereferences the pointer; ownership stays with this object.
     * @return the pointer value as a reference
     * @stable ICU 4.4
     */
    T &operator*() const { return *ptr; }

    /**
     * Member access through the pointer; ownership stays with this object.
     * @return the pointer value
     * @stable ICU 4.4
     */
    T *operator->() const { return ptr; }

    /**
     * Releases ownership: the internal pointer is reset to nullptr.
     * @return the previous pointer value;
     *         the caller becomes responsible for deleting the object
     * @stable ICU 4.4
     */
    T *orphan() {
        T *released=ptr;
        ptr=nullptr;
        return released;
    }

    /**
     * Replaces the held pointer with the one passed in.
     * The base class only overwrites the pointer and does not delete the
     * previously held object; a subclass must provide the deleting version.
     * @param p simple pointer to an object that is adopted
     * @stable ICU 4.4
     */
    void adoptInstead(T *p) {
        // delete ptr;
        ptr=p;
    }

protected:
    /**
     * The raw pointer itself.
     * @internal
     */
    T *ptr;

private:
    // Two LocalPointerBases must not be compared with each other.
    bool operator==(const LocalPointerBase<T> &other) = delete;
    bool operator!=(const LocalPointerBase<T> &other) = delete;
    // Ownership is never shared: copying and copy-assignment are disabled.
    LocalPointerBase(const LocalPointerBase<T> &other) = delete;
    void operator=(const LocalPointerBase<T> &other) = delete;
};
/**
* "Smart pointer" class, deletes objects via the standard C++ delete operator.
* For most methods see the LocalPointerBase base class.
*
* Usage example:
* \code
* LocalPointer<UnicodeString> s(new UnicodeString((UChar32)0x50005));
* int32_t length=s->length(); // 2
* char16_t lead=s->charAt(0); // 0xd900
* if(some condition) { return; } // no need to explicitly delete the pointer
* s.adoptInstead(new UnicodeString((char16_t)0xfffc));
* length=s->length(); // 1
* // no need to explicitly delete the pointer
* \endcode
*
* @see LocalPointerBase
* @stable ICU 4.4
*/
template<typename T>
class LocalPointer : public LocalPointerBase<T> {
public:
    using LocalPointerBase<T>::operator*;
    using LocalPointerBase<T>::operator->;
    /**
     * Constructor takes ownership.
     * @param p simple pointer to an object that is adopted
     * @stable ICU 4.4
     */
    explicit LocalPointer(T *p=nullptr) : LocalPointerBase<T>(p) {}
    /**
     * Constructor takes ownership and reports an error if nullptr.
     *
     * This constructor is intended to be used with other-class constructors
     * that may report a failure UErrorCode,
     * so that callers need to check only for U_FAILURE(errorCode)
     * and not also separately for isNull().
     *
     * @param p simple pointer to an object that is adopted
     * @param errorCode in/out UErrorCode, set to U_MEMORY_ALLOCATION_ERROR
     *     if p==nullptr and no other failure code had been set
     * @stable ICU 55
     */
    LocalPointer(T *p, UErrorCode &errorCode) : LocalPointerBase<T>(p) {
        if(p==nullptr && U_SUCCESS(errorCode)) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
    /**
     * Move constructor, leaves src with isNull().
     * @param src source smart pointer
     * @stable ICU 56
     */
    LocalPointer(LocalPointer<T> &&src) noexcept : LocalPointerBase<T>(src.ptr) {
        src.ptr=nullptr;
    }
    /**
     * Constructs a LocalPointer from a C++11 std::unique_ptr.
     * The LocalPointer steals the object owned by the std::unique_ptr.
     *
     * This constructor works via move semantics. If your std::unique_ptr is
     * in a local variable, you must use std::move.
     *
     * @param p The std::unique_ptr from which the pointer will be stolen.
     * @stable ICU 64
     */
    explicit LocalPointer(std::unique_ptr<T> &&p)
        : LocalPointerBase<T>(p.release()) {}
    /**
     * Destructor deletes the object it owns.
     * @stable ICU 4.4
     */
    ~LocalPointer() {
        delete LocalPointerBase<T>::ptr;
    }
    /**
     * Move assignment operator, leaves src with isNull().
     * Deletes the object currently owned by this LocalPointer, then takes over src's object.
     * Self-assignment (*this and src being the same object) is a no-op:
     * the owned object is kept and nothing is deleted.
     * @param src source smart pointer
     * @return *this
     * @stable ICU 56
     */
    LocalPointer<T> &operator=(LocalPointer<T> &&src) noexcept {
        // Without this guard a self-move would delete the owned object
        // and then leave this LocalPointer null.
        if(this!=&src) {
            delete LocalPointerBase<T>::ptr;
            LocalPointerBase<T>::ptr=src.ptr;
            src.ptr=nullptr;
        }
        return *this;
    }
    /**
     * Move-assign from an std::unique_ptr to this LocalPointer.
     * Steals the pointer from the std::unique_ptr;
     * the object previously owned by this LocalPointer is deleted.
     *
     * @param p The std::unique_ptr from which the pointer will be stolen.
     * @return *this
     * @stable ICU 64
     */
    LocalPointer<T> &operator=(std::unique_ptr<T> &&p) noexcept {
        adoptInstead(p.release());
        return *this;
    }
    /**
     * Swap pointers.
     * @param other other smart pointer
     * @stable ICU 56
     */
    void swap(LocalPointer<T> &other) noexcept {
        T *temp=LocalPointerBase<T>::ptr;
        LocalPointerBase<T>::ptr=other.ptr;
        other.ptr=temp;
    }
    /**
     * Non-member LocalPointer swap function.
     * @param p1 will get p2's pointer
     * @param p2 will get p1's pointer
     * @stable ICU 56
     */
    friend inline void swap(LocalPointer<T> &p1, LocalPointer<T> &p2) noexcept {
        p1.swap(p2);
    }
    /**
     * Deletes the object it owns,
     * and adopts (takes ownership of) the one passed in.
     * @param p simple pointer to an object that is adopted
     * @stable ICU 4.4
     */
    void adoptInstead(T *p) {
        delete LocalPointerBase<T>::ptr;
        LocalPointerBase<T>::ptr=p;
    }
    /**
     * Deletes the object it owns,
     * and adopts (takes ownership of) the one passed in.
     *
     * If U_FAILURE(errorCode), then the current object is retained and the new one deleted.
     *
     * If U_SUCCESS(errorCode) but the input pointer is nullptr,
     * then U_MEMORY_ALLOCATION_ERROR is set,
     * the current object is deleted, and nullptr is set.
     *
     * @param p simple pointer to an object that is adopted
     * @param errorCode in/out UErrorCode, set to U_MEMORY_ALLOCATION_ERROR
     *     if p==nullptr and no other failure code had been set
     * @stable ICU 55
     */
    void adoptInsteadAndCheckErrorCode(T *p, UErrorCode &errorCode) {
        if(U_SUCCESS(errorCode)) {
            delete LocalPointerBase<T>::ptr;
            LocalPointerBase<T>::ptr=p;
            if(p==nullptr) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
            }
        } else {
            // An earlier failure is pending: keep the current object and discard the new one.
            delete p;
        }
    }
    /**
     * Conversion operator to a C++11 std::unique_ptr.
     * Disowns the object and gives it to the returned std::unique_ptr.
     *
     * This operator works via move semantics. If your LocalPointer is
     * in a local variable, you must use std::move.
     *
     * @return An std::unique_ptr owning the pointer previously owned by this
     *     icu::LocalPointer.
     * @stable ICU 64
     */
    operator std::unique_ptr<T> () && {
        return std::unique_ptr<T>(LocalPointerBase<T>::orphan());
    }
};
/**
* "Smart pointer" class, deletes objects via the C++ array delete[] operator.
* For most methods see the LocalPointerBase base class.
* Adds operator[] for array item access.
*
* Usage example:
* \code
* LocalArray<UnicodeString> a(new UnicodeString[2]);
* a[0].append((char16_t)0x61);
* if(some condition) { return; } // no need to explicitly delete the array
* a.adoptInstead(new UnicodeString[4]);
* a[3].append((char16_t)0x62).append((char16_t)0x63).reverse();
* // no need to explicitly delete the array
* \endcode
*
* @see LocalPointerBase
* @stable ICU 4.4
*/
template<typename T>
class LocalArray : public LocalPointerBase<T> {
public:
    using LocalPointerBase<T>::operator*;
    using LocalPointerBase<T>::operator->;
    /**
     * Constructor takes ownership.
     * @param p simple pointer to an array of T objects that is adopted
     * @stable ICU 4.4
     */
    explicit LocalArray(T *p=nullptr) : LocalPointerBase<T>(p) {}
    /**
     * Constructor takes ownership and reports an error if nullptr.
     *
     * This constructor is intended to be used with other-class constructors
     * that may report a failure UErrorCode,
     * so that callers need to check only for U_FAILURE(errorCode)
     * and not also separately for isNull().
     *
     * @param p simple pointer to an array of T objects that is adopted
     * @param errorCode in/out UErrorCode, set to U_MEMORY_ALLOCATION_ERROR
     *     if p==nullptr and no other failure code had been set
     * @stable ICU 56
     */
    LocalArray(T *p, UErrorCode &errorCode) : LocalPointerBase<T>(p) {
        if(p==nullptr && U_SUCCESS(errorCode)) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
    /**
     * Move constructor, leaves src with isNull().
     * @param src source smart pointer
     * @stable ICU 56
     */
    LocalArray(LocalArray<T> &&src) noexcept : LocalPointerBase<T>(src.ptr) {
        src.ptr=nullptr;
    }
    /**
     * Constructs a LocalArray from a C++11 std::unique_ptr of an array type.
     * The LocalArray steals the array owned by the std::unique_ptr.
     *
     * This constructor works via move semantics. If your std::unique_ptr is
     * in a local variable, you must use std::move.
     *
     * @param p The std::unique_ptr from which the array will be stolen.
     * @stable ICU 64
     */
    explicit LocalArray(std::unique_ptr<T[]> &&p)
        : LocalPointerBase<T>(p.release()) {}
    /**
     * Destructor deletes the array it owns.
     * @stable ICU 4.4
     */
    ~LocalArray() {
        delete[] LocalPointerBase<T>::ptr;
    }
    /**
     * Move assignment operator, leaves src with isNull().
     * Deletes the array currently owned by this LocalArray, then takes over src's array.
     * Self-assignment (*this and src being the same object) is a no-op:
     * the owned array is kept and nothing is deleted.
     * @param src source smart pointer
     * @return *this
     * @stable ICU 56
     */
    LocalArray<T> &operator=(LocalArray<T> &&src) noexcept {
        // Without this guard a self-move would delete[] the owned array
        // and then leave this LocalArray null.
        if(this!=&src) {
            delete[] LocalPointerBase<T>::ptr;
            LocalPointerBase<T>::ptr=src.ptr;
            src.ptr=nullptr;
        }
        return *this;
    }
    /**
     * Move-assign from an std::unique_ptr to this LocalArray.
     * Steals the array from the std::unique_ptr;
     * the array previously owned by this LocalArray is deleted.
     *
     * @param p The std::unique_ptr from which the array will be stolen.
     * @return *this
     * @stable ICU 64
     */
    LocalArray<T> &operator=(std::unique_ptr<T[]> &&p) noexcept {
        adoptInstead(p.release());
        return *this;
    }
    /**
     * Swap pointers.
     * @param other other smart pointer
     * @stable ICU 56
     */
    void swap(LocalArray<T> &other) noexcept {
        T *temp=LocalPointerBase<T>::ptr;
        LocalPointerBase<T>::ptr=other.ptr;
        other.ptr=temp;
    }
    /**
     * Non-member LocalArray swap function.
     * @param p1 will get p2's pointer
     * @param p2 will get p1's pointer
     * @stable ICU 56
     */
    friend inline void swap(LocalArray<T> &p1, LocalArray<T> &p2) noexcept {
        p1.swap(p2);
    }
    /**
     * Deletes the array it owns,
     * and adopts (takes ownership of) the one passed in.
     * @param p simple pointer to an array of T objects that is adopted
     * @stable ICU 4.4
     */
    void adoptInstead(T *p) {
        delete[] LocalPointerBase<T>::ptr;
        LocalPointerBase<T>::ptr=p;
    }
    /**
     * Deletes the array it owns,
     * and adopts (takes ownership of) the one passed in.
     *
     * If U_FAILURE(errorCode), then the current array is retained and the new one deleted.
     *
     * If U_SUCCESS(errorCode) but the input pointer is nullptr,
     * then U_MEMORY_ALLOCATION_ERROR is set,
     * the current array is deleted, and nullptr is set.
     *
     * @param p simple pointer to an array of T objects that is adopted
     * @param errorCode in/out UErrorCode, set to U_MEMORY_ALLOCATION_ERROR
     *     if p==nullptr and no other failure code had been set
     * @stable ICU 56
     */
    void adoptInsteadAndCheckErrorCode(T *p, UErrorCode &errorCode) {
        if(U_SUCCESS(errorCode)) {
            delete[] LocalPointerBase<T>::ptr;
            LocalPointerBase<T>::ptr=p;
            if(p==nullptr) {
                errorCode=U_MEMORY_ALLOCATION_ERROR;
            }
        } else {
            // An earlier failure is pending: keep the current array and discard the new one.
            delete[] p;
        }
    }
    /**
     * Array item access (writable).
     * No index bounds check.
     * @param i array index
     * @return reference to the array item
     * @stable ICU 4.4
     */
    T &operator[](ptrdiff_t i) const { return LocalPointerBase<T>::ptr[i]; }
    /**
     * Conversion operator to a C++11 std::unique_ptr of an array type.
     * Disowns the array and gives it to the returned std::unique_ptr.
     *
     * This operator works via move semantics. If your LocalArray is
     * in a local variable, you must use std::move.
     *
     * @return An std::unique_ptr owning the array previously owned by this
     *     icu::LocalArray.
     * @stable ICU 64
     */
    operator std::unique_ptr<T[]> () && {
        return std::unique_ptr<T[]>(LocalPointerBase<T>::orphan());
    }
};
/**
* \def U_DEFINE_LOCAL_OPEN_POINTER
* "Smart pointer" definition macro, deletes objects via the closeFunction.
* Declares LocalPointerClassName as an alias for
* internal::LocalOpenPointer<Type, closeFunction>, a subclass of LocalPointerBase
* which works like LocalPointer<Type> except that it uses the closeFunction
* rather than the C++ delete operator.
*
* The expansion is a using-declaration without a trailing semicolon,
* so the macro invocation must be followed by one. Because the expansion names
* internal::LocalOpenPointer without further qualification, the macro must be
* used where that name is visible (for example inside the ICU namespace).
*
* Usage example:
* \code
* LocalUCaseMapPointer csm(ucasemap_open(localeID, options, &errorCode));
* utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(),
* utf8Out, (int32_t)sizeof(utf8Out),
* utf8In, utf8InLength, &errorCode);
* if(U_FAILURE(errorCode)) { return; } // no need to explicitly delete the UCaseMap
* \endcode
*
* @param LocalPointerClassName name of the smart pointer type to declare
* @param Type type of the object that the smart pointer owns
* @param closeFunction function that is called with the owned pointer to release it
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
#define U_DEFINE_LOCAL_OPEN_POINTER(LocalPointerClassName, Type, closeFunction) \
using LocalPointerClassName = internal::LocalOpenPointer<Type, closeFunction>
#ifndef U_IN_DOXYGEN
namespace internal {
/**
* Implementation, do not use directly: use U_DEFINE_LOCAL_OPEN_POINTER.
*
* @see U_DEFINE_LOCAL_OPEN_POINTER
* @internal
*/
template <typename Type, auto closeFunction>
class LocalOpenPointer : public LocalPointerBase<Type> {
    using LocalPointerBase<Type>::ptr;
public:
    using LocalPointerBase<Type>::operator*;
    using LocalPointerBase<Type>::operator->;
    /**
     * Constructor takes ownership.
     * @param p simple pointer to an object that is adopted
     */
    explicit LocalOpenPointer(Type *p=nullptr) : LocalPointerBase<Type>(p) {}
    /**
     * Move constructor, leaves src with isNull().
     * @param src source smart pointer
     */
    LocalOpenPointer(LocalOpenPointer &&src) noexcept
            : LocalPointerBase<Type>(src.ptr) {
        src.ptr=nullptr;
    }
    /**
     * Constructs a LocalOpenPointer from a std::unique_ptr whose deleter type
     * is the type of closeFunction; steals the object owned by the std::unique_ptr.
     * @param p The std::unique_ptr from which the pointer will be stolen.
     */
    /* TODO: Be agnostic of the deleter function signature from the user-provided std::unique_ptr? */
    explicit LocalOpenPointer(std::unique_ptr<Type, decltype(closeFunction)> &&p)
            : LocalPointerBase<Type>(p.release()) {}
    /**
     * Destructor; passes the owned object to closeFunction unless the pointer is nullptr.
     */
    ~LocalOpenPointer() { if (ptr != nullptr) { closeFunction(ptr); } }
    /**
     * Move assignment operator, leaves src with isNull().
     * Closes the object currently owned (if any), then takes over src's object.
     * Self-assignment (*this and src being the same object) is a no-op:
     * the owned object is kept and closeFunction is not called.
     * @param src source smart pointer
     * @return *this
     */
    LocalOpenPointer &operator=(LocalOpenPointer &&src) noexcept {
        // Without this guard a self-move would close the owned object
        // and then leave this LocalOpenPointer null.
        if (this != &src) {
            if (ptr != nullptr) { closeFunction(ptr); }
            LocalPointerBase<Type>::ptr=src.ptr;
            src.ptr=nullptr;
        }
        return *this;
    }
    /**
     * Move-assign from a std::unique_ptr; steals its pointer.
     * The object previously owned by this LocalOpenPointer (if any) is closed.
     * @param p The std::unique_ptr from which the pointer will be stolen.
     * @return *this
     */
    /* TODO: Be agnostic of the deleter function signature from the user-provided std::unique_ptr? */
    LocalOpenPointer &operator=(std::unique_ptr<Type, decltype(closeFunction)> &&p) {
        adoptInstead(p.release());
        return *this;
    }
    /**
     * Swap pointers.
     * @param other other smart pointer
     */
    void swap(LocalOpenPointer &other) noexcept {
        Type *temp=LocalPointerBase<Type>::ptr;
        LocalPointerBase<Type>::ptr=other.ptr;
        other.ptr=temp;
    }
    /**
     * Non-member swap function.
     * @param p1 will get p2's pointer
     * @param p2 will get p1's pointer
     */
    friend inline void swap(LocalOpenPointer &p1, LocalOpenPointer &p2) noexcept {
        p1.swap(p2);
    }
    /**
     * Closes the object it owns (if any),
     * and adopts (takes ownership of) the one passed in.
     * @param p simple pointer to an object that is adopted
     */
    void adoptInstead(Type *p) {
        if (ptr != nullptr) { closeFunction(ptr); }
        ptr=p;
    }
    /**
     * Conversion operator to a std::unique_ptr that uses closeFunction as its deleter.
     * Disowns the object and gives it to the returned std::unique_ptr.
     * Only callable on an rvalue; use std::move on a named LocalOpenPointer.
     * @return A std::unique_ptr owning the pointer previously owned by this object.
     */
    operator std::unique_ptr<Type, decltype(closeFunction)> () && {
        return std::unique_ptr<Type, decltype(closeFunction)>(LocalPointerBase<Type>::orphan(), closeFunction);
    }
};
} // namespace internal
#endif
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif /* __LOCALPOINTER_H__ */

View File

@@ -0,0 +1,211 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 2010-2016, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*/
#ifndef LOCDSPNM_H
#define LOCDSPNM_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C++ API: Provides display names of Locale and its components.
*/
#if !UCONFIG_NO_FORMATTING
#include "unicode/locid.h"
#include "unicode/strenum.h"
#include "unicode/uscript.h"
#include "unicode/uldnames.h"
#include "unicode/udisplaycontext.h"
U_NAMESPACE_BEGIN
/**
* Returns display names of Locales and components of Locales. For
* more information on language, script, region, variant, key, and
* values, see Locale.
* @stable ICU 4.4
*/
class U_COMMON_API LocaleDisplayNames : public UObject {
public:
/**
* Destructor.
* @stable ICU 4.4
*/
virtual ~LocaleDisplayNames();
/**
* Convenience overload of
* {@link #createInstance(const Locale& locale, UDialectHandling dialectHandling)}
* that specifies STANDARD dialect handling (ULDN_STANDARD_NAMES).
* @param locale the display locale
* @return a LocaleDisplayNames instance
* @stable ICU 4.4
*/
inline static LocaleDisplayNames* U_EXPORT2 createInstance(const Locale& locale);
/**
* Returns an instance of LocaleDisplayNames that returns names
* formatted for the provided locale, using the provided
* dialectHandling.
*
* NOTE(review): presumably the caller takes ownership of the returned
* instance; confirm against the implementation.
*
* @param locale the display locale
* @param dialectHandling how to select names for locales
* @return a LocaleDisplayNames instance
* @stable ICU 4.4
*/
static LocaleDisplayNames* U_EXPORT2 createInstance(const Locale& locale,
UDialectHandling dialectHandling);
/**
* Returns an instance of LocaleDisplayNames that returns names formatted
* for the provided locale, using the provided UDisplayContext settings.
*
* @param locale the display locale
* @param contexts List of one or more context settings (e.g. for dialect
* handling, capitalization, etc.)
* @param length Number of items in the contexts list
* @return a LocaleDisplayNames instance
* @stable ICU 51
*/
static LocaleDisplayNames* U_EXPORT2 createInstance(const Locale& locale,
UDisplayContext *contexts, int32_t length);
// getters for state
/**
* Returns the locale used to determine the display names. This is
* not necessarily the same locale passed to {@link #createInstance}.
* @return the display locale
* @stable ICU 4.4
*/
virtual const Locale& getLocale() const = 0;
/**
* Returns the dialect handling used in the display names.
* @return the dialect handling enum
* @stable ICU 4.4
*/
virtual UDialectHandling getDialectHandling() const = 0;
/**
* Returns the UDisplayContext value for the specified UDisplayContextType.
* @param type the UDisplayContextType whose value to return
* @return the UDisplayContext for the specified type.
* @stable ICU 51
*/
virtual UDisplayContext getContext(UDisplayContextType type) const = 0;
// names for entire locales
/**
* Returns the display name of the provided locale.
* @param locale the locale whose display name to return
* @param result receives the locale's display name
* @return the display name of the provided locale
* @stable ICU 4.4
*/
virtual UnicodeString& localeDisplayName(const Locale& locale,
UnicodeString& result) const = 0;
/**
* Returns the display name of the provided locale id.
* @param localeId the id of the locale whose display name to return
* @param result receives the locale's display name
* @return the display name of the provided locale
* @stable ICU 4.4
*/
virtual UnicodeString& localeDisplayName(const char* localeId,
UnicodeString& result) const = 0;
// names for components of a locale id
/**
* Returns the display name of the provided language code.
* @param lang the language code
* @param result receives the language code's display name
* @return the display name of the provided language code
* @stable ICU 4.4
*/
virtual UnicodeString& languageDisplayName(const char* lang,
UnicodeString& result) const = 0;
/**
* Returns the display name of the provided script code,
* given as a string.
* @param script the script code
* @param result receives the script code's display name
* @return the display name of the provided script code
* @stable ICU 4.4
*/
virtual UnicodeString& scriptDisplayName(const char* script,
UnicodeString& result) const = 0;
/**
* Returns the display name of the provided script code,
* given as a UScriptCode value.
* @param scriptCode the script code number
* @param result receives the script code's display name
* @return the display name of the provided script code
* @stable ICU 4.4
*/
virtual UnicodeString& scriptDisplayName(UScriptCode scriptCode,
UnicodeString& result) const = 0;
/**
* Returns the display name of the provided region code.
* @param region the region code
* @param result receives the region code's display name
* @return the display name of the provided region code
* @stable ICU 4.4
*/
virtual UnicodeString& regionDisplayName(const char* region,
UnicodeString& result) const = 0;
/**
* Returns the display name of the provided variant.
* @param variant the variant string
* @param result receives the variant's display name
* @return the display name of the provided variant
* @stable ICU 4.4
*/
virtual UnicodeString& variantDisplayName(const char* variant,
UnicodeString& result) const = 0;
/**
* Returns the display name of the provided locale key.
* @param key the locale key name
* @param result receives the locale key's display name
* @return the display name of the provided locale key
* @stable ICU 4.4
*/
virtual UnicodeString& keyDisplayName(const char* key,
UnicodeString& result) const = 0;
/**
* Returns the display name of the provided value (used with the provided key).
* @param key the locale key name
* @param value the locale key's value
* @param result receives the value's display name
* @return the display name of the provided value
* @stable ICU 4.4
*/
virtual UnicodeString& keyValueDisplayName(const char* key, const char* value,
UnicodeString& result) const = 0;
};
// Single-argument convenience overload: forwards to the two-argument
// createInstance() with standard dialect handling (ULDN_STANDARD_NAMES).
inline LocaleDisplayNames* LocaleDisplayNames::createInstance(const Locale& locale) {
    const UDialectHandling standardHandling = ULDN_STANDARD_NAMES;
    return createInstance(locale, standardHandling);
}
U_NAMESPACE_END
#endif
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

1297
thirdparty/icu4c/common/unicode/locid.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,950 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2011-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: messagepattern.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2011mar14
* created by: Markus W. Scherer
*/
#ifndef __MESSAGEPATTERN_H__
#define __MESSAGEPATTERN_H__
/**
* \file
* \brief C++ API: MessagePattern class: Parses and represents ICU MessageFormat patterns.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#if !UCONFIG_NO_FORMATTING
#include "unicode/parseerr.h"
#include "unicode/unistr.h"
/**
* Mode for when an apostrophe starts quoted literal text for MessageFormat output.
* The default is DOUBLE_OPTIONAL unless overridden via uconfig.h
* (UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE).
* <p>
* A pair of adjacent apostrophes always results in a single apostrophe in the output,
* even when the pair is between two single, text-quoting apostrophes.
* <p>
* The following table shows examples of desired MessageFormat.format() output
* with the pattern strings that yield that output.
* <p>
* <table>
* <tr>
* <th>Desired output</th>
* <th>DOUBLE_OPTIONAL</th>
* <th>DOUBLE_REQUIRED</th>
* </tr>
* <tr>
* <td>I see {many}</td>
* <td>I see '{many}'</td>
* <td>(same)</td>
* </tr>
* <tr>
* <td>I said {'Wow!'}</td>
* <td>I said '{''Wow!''}'</td>
* <td>(same)</td>
* </tr>
* <tr>
* <td>I don't know</td>
* <td>I don't know OR<br> I don''t know</td>
* <td>I don''t know</td>
* </tr>
* </table>
* @stable ICU 4.8
* @see UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE
*/
enum UMessagePatternApostropheMode {
/**
* A literal apostrophe is represented by
* either a single or a double apostrophe pattern character.
* Within a MessageFormat pattern, a single apostrophe only starts quoted literal text
* if it immediately precedes a curly brace {},
* or a pipe symbol | if inside a choice format,
* or a pound symbol # if inside a plural format.
* <p>
* Example: the output "I don't know" can be written as the pattern
* "I don't know" or as "I don''t know".
* <p>
* This is the default behavior starting with ICU 4.8.
* @stable ICU 4.8
*/
UMSGPAT_APOS_DOUBLE_OPTIONAL,
/**
* A literal apostrophe must be represented by
* a double apostrophe pattern character.
* A single apostrophe always starts quoted literal text.
* <p>
* Example: the output "I don't know" must be written as the pattern
* "I don''t know".
* <p>
* This is the behavior of ICU 4.6 and earlier, and of the JDK.
* @stable ICU 4.8
*/
UMSGPAT_APOS_DOUBLE_REQUIRED
};
/**
* C-style alias that lets the enum type be named without the "enum" keyword.
* @stable ICU 4.8
*/
typedef enum UMessagePatternApostropheMode UMessagePatternApostropheMode;
/**
* MessagePattern::Part type constants.
* @stable ICU 4.8
*/
enum UMessagePatternPartType {
/**
* Start of a message pattern (main or nested).
* The length is 0 for the top-level message
* and for a choice argument sub-message, otherwise 1 for the '{'.
* The value indicates the nesting level, starting with 0 for the main message.
* <p>
* There is always a later MSG_LIMIT part.
* MessagePattern::getLimitPartIndex() returns the index of that MSG_LIMIT part.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_MSG_START,
/**
* End of a message pattern (main or nested).
* The length is 0 for the top-level message and
* the last sub-message of a choice argument,
* otherwise 1 for the '}' or (in a choice argument style) the '|'.
* The value indicates the nesting level, starting with 0 for the main message.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_MSG_LIMIT,
/**
* Indicates a substring of the pattern string which is to be skipped when formatting.
* For example, an apostrophe that begins or ends quoted text
* would be indicated with such a part.
* The value is undefined and currently always 0.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_SKIP_SYNTAX,
/**
* Indicates that a syntax character needs to be inserted for auto-quoting.
* The length is 0.
* The value is the character code of the insertion character. (U+0027=APOSTROPHE)
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_INSERT_CHAR,
/**
* Indicates a syntactic (non-escaped) # symbol in a plural variant.
* When formatting, replace this part's substring with the
* (value-offset) for the plural argument value.
* The value is undefined and currently always 0.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_REPLACE_NUMBER,
/**
* Start of an argument.
* The length is 1 for the '{'.
* The value is the ordinal value of the ArgType. Use getArgType().
* <p>
* This part is followed by either an ARG_NUMBER or ARG_NAME,
* followed by optional argument sub-parts (see UMessagePatternArgType constants)
* and finally an ARG_LIMIT part.
* MessagePattern::getLimitPartIndex() returns the index of that ARG_LIMIT part.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_START,
/**
* End of an argument.
* The length is 1 for the '}'.
* The value is the ordinal value of the ArgType. Use getArgType().
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_LIMIT,
/**
* The argument number, provided by the value.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_NUMBER,
/**
* The argument name.
* The value is undefined and currently always 0.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_NAME,
/**
* The argument type.
* The value is undefined and currently always 0.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_TYPE,
/**
* The argument style text.
* The value is undefined and currently always 0.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_STYLE,
/**
* A selector substring in a "complex" argument style.
* The value is undefined and currently always 0.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_SELECTOR,
/**
* An integer value, for example the offset or an explicit selector value
* in a PluralFormat style.
* The part value is the integer value.
* Part::hasNumericValue() returns true for this type.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_INT,
/**
* A numeric value, for example the offset or an explicit selector value
* in a PluralFormat style.
* The part value is an index into an internal array of numeric values;
* use getNumericValue().
* Part::hasNumericValue() returns true for this type.
* @stable ICU 4.8
*/
UMSGPAT_PART_TYPE_ARG_DOUBLE
};
/**
* C-style alias that lets the enum type be named without the "enum" keyword.
* @stable ICU 4.8
*/
typedef enum UMessagePatternPartType UMessagePatternPartType;
/**
* Argument type constants.
* Returned by Part.getArgType() for ARG_START and ARG_LIMIT parts.
*
* Messages nested inside an argument are each delimited by MSG_START and MSG_LIMIT,
* with a nesting level one greater than the surrounding message.
* @stable ICU 4.8
*/
enum UMessagePatternArgType {
/**
* The argument has no specified type.
* Part::getArgType() also returns this constant for any part
* that is neither ARG_START nor ARG_LIMIT.
* @stable ICU 4.8
*/
UMSGPAT_ARG_TYPE_NONE,
/**
* The argument has a "simple" type which is provided by the ARG_TYPE part.
* An ARG_STYLE part might follow that.
* @stable ICU 4.8
*/
UMSGPAT_ARG_TYPE_SIMPLE,
/**
* The argument is a ChoiceFormat with one or more
* ((ARG_INT | ARG_DOUBLE), ARG_SELECTOR, message) tuples.
* @stable ICU 4.8
*/
UMSGPAT_ARG_TYPE_CHOICE,
/**
* The argument is a cardinal-number PluralFormat with an optional ARG_INT or ARG_DOUBLE offset
* (e.g., offset:1)
* and one or more (ARG_SELECTOR [explicit-value] message) tuples.
* If the selector has an explicit value (e.g., =2), then
* that value is provided by the ARG_INT or ARG_DOUBLE part preceding the message.
* Otherwise the message immediately follows the ARG_SELECTOR.
* UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE() is true for this type.
* @stable ICU 4.8
*/
UMSGPAT_ARG_TYPE_PLURAL,
/**
* The argument is a SelectFormat with one or more (ARG_SELECTOR, message) pairs.
* @stable ICU 4.8
*/
UMSGPAT_ARG_TYPE_SELECT,
/**
* The argument is an ordinal-number PluralFormat
* with the same style parts sequence and semantics as UMSGPAT_ARG_TYPE_PLURAL.
* UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE() is true for this type.
* @stable ICU 50
*/
UMSGPAT_ARG_TYPE_SELECTORDINAL
};
/**
* C-style alias that lets the enum type be named without the "enum" keyword.
* @stable ICU 4.8
*/
typedef enum UMessagePatternArgType UMessagePatternArgType;
/**
* \def UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE
* Tests whether an argument type uses the plural style part sequence and semantics,
* that is, whether it is UMSGPAT_ARG_TYPE_PLURAL or UMSGPAT_ARG_TYPE_SELECTORDINAL.
* The argument expression may be evaluated more than once.
* @stable ICU 50
*/
#define UMSGPAT_ARG_TYPE_HAS_PLURAL_STYLE(argType) \
    (UMSGPAT_ARG_TYPE_PLURAL==(argType) || UMSGPAT_ARG_TYPE_SELECTORDINAL==(argType))
enum {
/**
* Return value from MessagePattern.validateArgumentName() for when
* the string is a valid "pattern identifier" but not a number.
* @stable ICU 4.8
*/
UMSGPAT_ARG_NAME_NOT_NUMBER=-1,
/**
* Return value from MessagePattern.validateArgumentName() for when
* the string is invalid.
* It might not be a valid "pattern identifier",
* or it might consist only of ASCII digits but have a leading zero or be too large a number.
* @stable ICU 4.8
*/
UMSGPAT_ARG_NAME_NOT_VALID=-2
};
/**
* Sentinel that getNumericValue(Part) returns for a part
* that has no numeric value defined.
* @see MessagePattern.getNumericValue()
* @stable ICU 4.8
*/
#define UMSGPAT_NO_NUMERIC_VALUE (static_cast<double>(-123456789))
U_NAMESPACE_BEGIN
class MessagePatternDoubleList;
class MessagePatternPartsList;
/**
* Parses and represents ICU MessageFormat patterns.
* Also handles patterns for ChoiceFormat, PluralFormat and SelectFormat.
* Used in the implementations of those classes as well as in tools
* for message validation, translation and format conversion.
* <p>
* The parser handles all syntax relevant for identifying message arguments.
* This includes "complex" arguments whose style strings contain
* nested MessageFormat pattern substrings.
* For "simple" arguments (with no nested MessageFormat pattern substrings),
* the argument style is not parsed any further.
* <p>
* The parser handles named and numbered message arguments and allows both in one message.
* <p>
* Once a pattern has been parsed successfully, iterate through the parsed data
* with countParts(), getPart() and related methods.
* <p>
* The data logically represents a parse tree, but is stored and accessed
* as a list of "parts" for fast and simple parsing and to minimize object allocations.
* Arguments and nested messages are best handled via recursion.
* For every _START "part", MessagePattern.getLimitPartIndex() efficiently returns
* the index of the corresponding _LIMIT "part".
* <p>
* List of "parts":
* <pre>
* message = MSG_START (SKIP_SYNTAX | INSERT_CHAR | REPLACE_NUMBER | argument)* MSG_LIMIT
* argument = noneArg | simpleArg | complexArg
* complexArg = choiceArg | pluralArg | selectArg
*
* noneArg = ARG_START.NONE (ARG_NAME | ARG_NUMBER) ARG_LIMIT.NONE
* simpleArg = ARG_START.SIMPLE (ARG_NAME | ARG_NUMBER) ARG_TYPE [ARG_STYLE] ARG_LIMIT.SIMPLE
* choiceArg = ARG_START.CHOICE (ARG_NAME | ARG_NUMBER) choiceStyle ARG_LIMIT.CHOICE
* pluralArg = ARG_START.PLURAL (ARG_NAME | ARG_NUMBER) pluralStyle ARG_LIMIT.PLURAL
* selectArg = ARG_START.SELECT (ARG_NAME | ARG_NUMBER) selectStyle ARG_LIMIT.SELECT
*
* choiceStyle = ((ARG_INT | ARG_DOUBLE) ARG_SELECTOR message)+
* pluralStyle = [ARG_INT | ARG_DOUBLE] (ARG_SELECTOR [ARG_INT | ARG_DOUBLE] message)+
* selectStyle = (ARG_SELECTOR message)+
* </pre>
* <ul>
* <li>Literal output text is not represented directly by "parts" but accessed
* between parts of a message, from one part's getLimit() to the next part's getIndex().
* <li><code>ARG_START.CHOICE</code> stands for an ARG_START Part with ArgType CHOICE.
* <li>In the choiceStyle, the ARG_SELECTOR has the '<', the '#' or
* the less-than-or-equal-to sign (U+2264).
* <li>In the pluralStyle, the first, optional numeric Part has the "offset:" value.
* The optional numeric Part between each (ARG_SELECTOR, message) pair
* is the value of an explicit-number selector like "=2",
* otherwise the selector is a non-numeric identifier.
* <li>The REPLACE_NUMBER Part can occur only in an immediate sub-message of the pluralStyle.
* </ul>
* <p>
* This class is not intended for public subclassing.
*
* @stable ICU 4.8
*/
class U_COMMON_API MessagePattern : public UObject {
public:
/**
* Constructs an empty MessagePattern with default UMessagePatternApostropheMode.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.8
*/
MessagePattern(UErrorCode &errorCode);
/**
* Constructs an empty MessagePattern.
* @param mode Explicit UMessagePatternApostropheMode.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.8
*/
MessagePattern(UMessagePatternApostropheMode mode, UErrorCode &errorCode);
/**
* Constructs a MessagePattern with default UMessagePatternApostropheMode and
* parses the MessageFormat pattern string.
* @param pattern a MessageFormat pattern string
* @param parseError Struct to receive information on the position
* of an error within the pattern.
* Can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* TODO: turn @throws into UErrorCode specifics?
* @throws IllegalArgumentException for syntax errors in the pattern string
* @throws IndexOutOfBoundsException if certain limits are exceeded
* (e.g., argument number too high, argument name too long, etc.)
* @throws NumberFormatException if a number could not be parsed
* @stable ICU 4.8
*/
MessagePattern(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode);
/**
* Copy constructor.
* @param other Object to copy.
* @stable ICU 4.8
*/
MessagePattern(const MessagePattern &other);
/**
* Assignment operator.
* @param other Object to copy.
* @return *this=other
* @stable ICU 4.8
*/
MessagePattern &operator=(const MessagePattern &other);
/**
* Destructor.
* @stable ICU 4.8
*/
virtual ~MessagePattern();
/**
* Parses a MessageFormat pattern string.
* @param pattern a MessageFormat pattern string
* @param parseError Struct to receive information on the position
* of an error within the pattern.
* Can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return *this
* @throws IllegalArgumentException for syntax errors in the pattern string
* @throws IndexOutOfBoundsException if certain limits are exceeded
* (e.g., argument number too high, argument name too long, etc.)
* @throws NumberFormatException if a number could not be parsed
* @stable ICU 4.8
*/
MessagePattern &parse(const UnicodeString &pattern,
UParseError *parseError, UErrorCode &errorCode);
/**
* Parses a ChoiceFormat pattern string.
* @param pattern a ChoiceFormat pattern string
* @param parseError Struct to receive information on the position
* of an error within the pattern.
* Can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return *this
* @throws IllegalArgumentException for syntax errors in the pattern string
* @throws IndexOutOfBoundsException if certain limits are exceeded
* (e.g., argument number too high, argument name too long, etc.)
* @throws NumberFormatException if a number could not be parsed
* @stable ICU 4.8
*/
MessagePattern &parseChoiceStyle(const UnicodeString &pattern,
UParseError *parseError, UErrorCode &errorCode);
/**
* Parses a PluralFormat pattern string.
* @param pattern a PluralFormat pattern string
* @param parseError Struct to receive information on the position
* of an error within the pattern.
* Can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return *this
* @throws IllegalArgumentException for syntax errors in the pattern string
* @throws IndexOutOfBoundsException if certain limits are exceeded
* (e.g., argument number too high, argument name too long, etc.)
* @throws NumberFormatException if a number could not be parsed
* @stable ICU 4.8
*/
MessagePattern &parsePluralStyle(const UnicodeString &pattern,
UParseError *parseError, UErrorCode &errorCode);
/**
* Parses a SelectFormat pattern string.
* @param pattern a SelectFormat pattern string
* @param parseError Struct to receive information on the position
* of an error within the pattern.
* Can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return *this
* @throws IllegalArgumentException for syntax errors in the pattern string
* @throws IndexOutOfBoundsException if certain limits are exceeded
* (e.g., argument number too high, argument name too long, etc.)
* @throws NumberFormatException if a number could not be parsed
* @stable ICU 4.8
*/
MessagePattern &parseSelectStyle(const UnicodeString &pattern,
UParseError *parseError, UErrorCode &errorCode);
/**
* Clears this MessagePattern.
* countParts() will return 0.
* @stable ICU 4.8
*/
void clear();
/**
* Clears this MessagePattern and sets the UMessagePatternApostropheMode.
* countParts() will return 0.
* @param mode The new UMessagePatternApostropheMode.
* @stable ICU 4.8
*/
void clearPatternAndSetApostropheMode(UMessagePatternApostropheMode mode) {
    // Discard the parsed state first, then store the requested mode.
    this->clear();
    this->aposMode=mode;
}
/**
* @param other another object to compare with.
* @return true if this object is equivalent to the other one.
* @stable ICU 4.8
*/
bool operator==(const MessagePattern &other) const;
/**
* @param other another object to compare with.
* @return false if this object is equivalent to the other one.
* @stable ICU 4.8
*/
inline bool operator!=(const MessagePattern &other) const {
    // Defined purely as the negation of operator==.
    const bool isEqual=operator==(other);
    return !isEqual;
}
/**
* @return A hash code for this object.
* @stable ICU 4.8
*/
int32_t hashCode() const;
/**
* @return this instance's UMessagePatternApostropheMode.
* @stable ICU 4.8
*/
UMessagePatternApostropheMode getApostropheMode() const {
    // Plain accessor for the aposMode member.
    return this->aposMode;
}
// Java has package-private jdkAposMode() here.
// In C++, this is declared in the MessageImpl class.
/**
* @return the parsed pattern string (a C++ reference, so never null; presumably empty if none was parsed -- confirm against clear()).
* @stable ICU 4.8
*/
const UnicodeString &getPatternString() const {
    // Hands out a reference to the msg member; no copy is made here.
    return this->msg;
}
/**
* Does the parsed pattern have named arguments like {first_name}?
* @return true if the parsed pattern has at least one named argument.
* @stable ICU 4.8
*/
UBool hasNamedArguments() const {
    // Reports the hasArgNames flag as stored on this object.
    return this->hasArgNames;
}
/**
* Does the parsed pattern have numbered arguments like {2}?
* @return true if the parsed pattern has at least one numbered argument.
* @stable ICU 4.8
*/
UBool hasNumberedArguments() const {
    // Reports the hasArgNumbers flag as stored on this object.
    return this->hasArgNumbers;
}
/**
* Validates and parses an argument name or argument number string.
* An argument name must be a "pattern identifier", that is, it must contain
* no Unicode Pattern_Syntax or Pattern_White_Space characters.
* If it only contains ASCII digits, then it must be a small integer with no leading zero.
* @param name Input string.
* @return &gt;=0 if the name is a valid number,
* UMSGPAT_ARG_NAME_NOT_NUMBER (-1) if it is a "pattern identifier" but not all ASCII digits,
* UMSGPAT_ARG_NAME_NOT_VALID (-2) if it is neither.
* @stable ICU 4.8
*/
static int32_t validateArgumentName(const UnicodeString &name);
/**
* Returns a version of the parsed pattern string where each ASCII apostrophe
* is doubled (escaped) if it is not already, and if it is not interpreted as quoting syntax.
* <p>
* For example, this turns "I don't '{know}' {gender,select,female{h''er}other{h'im}}."
* into "I don''t '{know}' {gender,select,female{h''er}other{h''im}}."
* @return the deep-auto-quoted version of the parsed pattern string.
* @see MessageFormat.autoQuoteApostrophe()
* @stable ICU 4.8
*/
UnicodeString autoQuoteApostropheDeep() const;
class Part;
/**
* Returns the number of "parts" created by parsing the pattern string.
* Returns 0 if no pattern has been parsed or clear() was called.
* @return the number of pattern parts.
* @stable ICU 4.8
*/
int32_t countParts() const {
    // Number of entries currently held in the parts array.
    return this->partsLength;
}
/**
* Gets the i-th pattern "part".
* @param i The index of the Part data. (0..countParts()-1)
* @return the i-th pattern "part".
* @stable ICU 4.8
*/
const Part &getPart(int32_t i) const {
    // No bounds check is performed here; i is used as given.
    const Part *entry=parts+i;
    return *entry;
}
/**
* Returns the UMessagePatternPartType of the i-th pattern "part".
* Convenience method for getPart(i).getType().
* @param i The index of the Part data. (0..countParts()-1)
* @return The UMessagePatternPartType of the i-th Part.
* @stable ICU 4.8
*/
UMessagePatternPartType getPartType(int32_t i) const {
    // Reads the type field of the i-th entry directly from the parts array.
    return parts[i].type;
}
/**
* Returns the pattern index of the specified pattern "part".
* Convenience method for getPart(partIndex).getIndex().
* @param partIndex The index of the Part data. (0..countParts()-1)
* @return The pattern index of this Part.
* @stable ICU 4.8
*/
int32_t getPatternIndex(int32_t partIndex) const {
    // Reads the index field of the requested entry directly from the parts array.
    return parts[partIndex].index;
}
/**
* Returns the substring of the pattern string indicated by the Part.
* Convenience method for getPatternString().substring(part.getIndex(), part.getLimit()).
* @param part a part of this MessagePattern.
* @return the substring associated with part.
* @stable ICU 4.8
*/
UnicodeString getSubstring(const Part &part) const {
    // The part records where its text starts in msg and how long it is.
    const int32_t start=part.index;
    const int32_t len=part.length;
    return msg.tempSubString(start, len);
}
/**
* Compares the part's substring with the input string s.
* @param part a part of this MessagePattern.
* @param s a string.
* @return true if getSubstring(part).equals(s).
* @stable ICU 4.8
*/
UBool partSubstringMatches(const Part &part, const UnicodeString &s) const {
    // A compare() result of 0 is treated as a match.
    const int32_t start=part.index;
    const int32_t len=part.length;
    return msg.compare(start, len, s)==0;
}
/**
* Returns the numeric value associated with an ARG_INT or ARG_DOUBLE.
* @param part a part of this MessagePattern.
* @return the part's numeric value, or UMSGPAT_NO_NUMERIC_VALUE if this is not a numeric part.
* @stable ICU 4.8
*/
double getNumericValue(const Part &part) const;
/**
* Returns the "offset:" value of a PluralFormat argument, or 0 if none is specified.
* @param pluralStart the index of the first PluralFormat argument style part. (0..countParts()-1)
* @return the "offset:" value.
* @stable ICU 4.8
*/
double getPluralOffset(int32_t pluralStart) const;
/**
* Returns the index of the ARG|MSG_LIMIT part corresponding to the ARG|MSG_START at start.
* @param start The index of some Part data (0..countParts()-1);
* this Part should be of Type ARG_START or MSG_START.
* @return The first i>start where getPart(i).getType()==ARG|MSG_LIMIT at the same nesting level,
* or start itself if getPartType(start)!=ARG|MSG_START.
* @stable ICU 4.8
*/
int32_t getLimitPartIndex(int32_t start) const {
    // A stored limit index below start is not used; start itself is returned instead.
    const int32_t storedLimit=parts[start].limitPartIndex;
    return storedLimit<start ? start : storedLimit;
}
/**
* A message pattern "part", representing a pattern parsing event.
* There is a part for the start and end of a message or argument,
* for quoting and escaping of and with ASCII apostrophes,
* and for syntax elements of "complex" arguments.
* @stable ICU 4.8
*/
class Part : public UMemory {
public:
/**
* Default constructor, do not use.
* @internal
*/
Part() {
// NOTE(review): the empty body assigns no member, so type, index, length, value
// and limitPartIndex hold indeterminate values until something writes them --
// presumably MessagePattern (declared a friend below) while parsing; confirm.
}
/**
* Returns the type of this part.
* @return the part type.
* @stable ICU 4.8
*/
UMessagePatternPartType getType() const {
    // Plain accessor for the type field.
    return this->type;
}
/**
* Returns the pattern string index associated with this Part.
* @return this part's pattern string index.
* @stable ICU 4.8
*/
int32_t getIndex() const {
    // Plain accessor for the index field.
    return this->index;
}
/**
* Returns the length of the pattern substring associated with this Part.
* This is 0 for some parts.
* @return this part's pattern substring length.
* @stable ICU 4.8
*/
int32_t getLength() const {
    // length is stored as uint16_t; widen it to the int32_t return type.
    return static_cast<int32_t>(length);
}
/**
* Returns the pattern string limit (exclusive-end) index associated with this Part.
* Convenience method for getIndex()+getLength().
* @return this part's pattern string limit index, same as getIndex()+getLength().
* @stable ICU 4.8
*/
int32_t getLimit() const {
    // Exclusive end: start index plus the (16-bit) length.
    const int32_t start=index;
    const int32_t len=length;
    return start+len;
}
/**
* Returns a value associated with this part.
* See the documentation of each part type for details.
* @return the part value.
* @stable ICU 4.8
*/
int32_t getValue() const {
    // value is stored as int16_t; widen it to the int32_t return type.
    return static_cast<int32_t>(value);
}
/**
* Returns the argument type if this part is of type ARG_START or ARG_LIMIT,
* otherwise UMSGPAT_ARG_TYPE_NONE.
* @return the argument type for this part.
* @stable ICU 4.8
*/
UMessagePatternArgType getArgType() const {
    // For ARG_START and ARG_LIMIT parts the value field is reinterpreted as
    // the argument type; every other part type reports NONE.
    switch(getType()) {
    case UMSGPAT_PART_TYPE_ARG_START:
    case UMSGPAT_PART_TYPE_ARG_LIMIT:
        return static_cast<UMessagePatternArgType>(value);
    default:
        return UMSGPAT_ARG_TYPE_NONE;
    }
}
/**
* Indicates whether the Part type has a numeric value.
* If so, then that numeric value can be retrieved via MessagePattern.getNumericValue().
* @param type The Part type to be tested.
* @return true if the Part type has a numeric value.
* @stable ICU 4.8
*/
static UBool hasNumericValue(UMessagePatternPartType type) {
    // Exactly two part types count as numeric: ARG_INT and ARG_DOUBLE.
    switch(type) {
    case UMSGPAT_PART_TYPE_ARG_INT:
    case UMSGPAT_PART_TYPE_ARG_DOUBLE:
        return true;
    default:
        return false;
    }
}
/**
* @param other another object to compare with.
* @return true if this object is equivalent to the other one.
* @stable ICU 4.8
*/
bool operator==(const Part &other) const;
/**
* @param other another object to compare with.
* @return false if this object is equivalent to the other one.
* @stable ICU 4.8
*/
inline bool operator!=(const Part &other) const {
    // Defined purely as the negation of operator==.
    const bool isEqual=operator==(other);
    return !isEqual;
}
/**
* @return A hash code for this object.
* @stable ICU 4.8
*/
int32_t hashCode() const {
    // Combines type, index, length and value with the multiplier 37, exactly as
    // before, but in unsigned 32-bit arithmetic. The signed form multiplied
    // index by 37*37*37 (=50653), which exceeds INT32_MAX once index is above
    // roughly 42,000, and signed overflow is undefined behavior. Unsigned
    // arithmetic wraps modulo 2^32, so the resulting bit pattern matches what
    // two's-complement wraparound produced.
    uint32_t hash=static_cast<uint32_t>(type);
    hash=hash*37u+static_cast<uint32_t>(index);
    hash=hash*37u+static_cast<uint32_t>(length);
    // value is a signed 16-bit field; sign-extend it before the modular add.
    hash=hash*37u+static_cast<uint32_t>(static_cast<int32_t>(value));
    // Converting back to int32_t is modular on the compilers ICU targets
    // (and by definition from C++20 on).
    return static_cast<int32_t>(hash);
}
private:
friend class MessagePattern;
static const int32_t MAX_LENGTH=0xffff;
static const int32_t MAX_VALUE=0x7fff;
static const int32_t MAX_NESTED_LEVELS=0x03ff;
// Some fields are not final because they are modified during pattern parsing.
// After pattern parsing, the parts are effectively immutable.
UMessagePatternPartType type;
int32_t index;
uint16_t length;
int16_t value;
int32_t limitPartIndex;
};
private:
void preParse(const UnicodeString &pattern, UParseError *parseError, UErrorCode &errorCode);
void postParse();
int32_t parseMessage(int32_t index, int32_t msgStartLength,
int32_t nestingLevel, UMessagePatternArgType parentType,
UParseError *parseError, UErrorCode &errorCode);
int32_t parseArg(int32_t index, int32_t argStartLength, int32_t nestingLevel,
UParseError *parseError, UErrorCode &errorCode);
int32_t parseSimpleStyle(int32_t index, UParseError *parseError, UErrorCode &errorCode);
int32_t parseChoiceStyle(int32_t index, int32_t nestingLevel,
UParseError *parseError, UErrorCode &errorCode);
int32_t parsePluralOrSelectStyle(UMessagePatternArgType argType, int32_t index, int32_t nestingLevel,
UParseError *parseError, UErrorCode &errorCode);
/**
* Validates and parses an argument name or argument number string.
* This internal method assumes that the input substring is a "pattern identifier".
* @return &gt;=0 if the name is a valid number,
* UMSGPAT_ARG_NAME_NOT_NUMBER (-1) if it is a "pattern identifier" but not all ASCII digits,
* UMSGPAT_ARG_NAME_NOT_VALID (-2) if it is neither.
* @see validateArgumentName(const UnicodeString &)
*/
static int32_t parseArgNumber(const UnicodeString &s, int32_t start, int32_t limit);
int32_t parseArgNumber(int32_t start, int32_t limit) {
    // Convenience overload: forwards to the static three-argument version
    // using this object's own pattern string.
    return MessagePattern::parseArgNumber(msg, start, limit);
}
/**
* Parses a number from the specified message substring.
* @param start start index into the message string
* @param limit limit index into the message string, must be start<limit
* @param allowInfinity true if U+221E is allowed (for ChoiceFormat)
* @param parseError
* @param errorCode
*/
void parseDouble(int32_t start, int32_t limit, UBool allowInfinity,
UParseError *parseError, UErrorCode &errorCode);
// Java has package-private appendReducedApostrophes() here.
// In C++, this is declared in the MessageImpl class.
int32_t skipWhiteSpace(int32_t index);
int32_t skipIdentifier(int32_t index);
/**
* Skips a sequence of characters that could occur in a double value.
* Does not fully parse or validate the value.
*/
int32_t skipDouble(int32_t index);
static UBool isArgTypeChar(UChar32 c);
UBool isChoice(int32_t index);
UBool isPlural(int32_t index);
UBool isSelect(int32_t index);
UBool isOrdinal(int32_t index);
/**
* @return true if we are inside a MessageFormat (sub-)pattern,
* as opposed to inside a top-level choice/plural/select pattern.
*/
UBool inMessageFormatPattern(int32_t nestingLevel);
/**
* @return true if we are in a MessageFormat sub-pattern
* of a top-level ChoiceFormat pattern.
*/
UBool inTopLevelChoiceMessage(int32_t nestingLevel, UMessagePatternArgType parentType);
void addPart(UMessagePatternPartType type, int32_t index, int32_t length,
int32_t value, UErrorCode &errorCode);
void addLimitPart(int32_t start,
UMessagePatternPartType type, int32_t index, int32_t length,
int32_t value, UErrorCode &errorCode);
void addArgDoublePart(double numericValue, int32_t start, int32_t length, UErrorCode &errorCode);
void setParseError(UParseError *parseError, int32_t index);
UBool init(UErrorCode &errorCode);
UBool copyStorage(const MessagePattern &other, UErrorCode &errorCode);
UMessagePatternApostropheMode aposMode;
UnicodeString msg;
// ArrayList<Part> parts=new ArrayList<Part>();
MessagePatternPartsList *partsList;
Part *parts;
int32_t partsLength;
// ArrayList<Double> numericValues;
MessagePatternDoubleList *numericValuesList;
double *numericValues;
int32_t numericValuesLength;
UBool hasArgNames;
UBool hasArgNumbers;
UBool needsAutoQuoting;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_FORMATTING
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __MESSAGEPATTERN_H__

View File

@@ -0,0 +1,791 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: normalizer2.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov22
* created by: Markus W. Scherer
*/
#ifndef __NORMALIZER2_H__
#define __NORMALIZER2_H__
/**
* \file
* \brief C++ API: New API for Unicode Normalization.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/stringpiece.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm2.h"
U_NAMESPACE_BEGIN
class ByteSink;
/**
* Unicode normalization functionality for standard Unicode normalization or
* for using custom mapping tables.
* All instances of this class are unmodifiable/immutable.
* Instances returned by getInstance() are singletons that must not be deleted by the caller.
* The Normalizer2 class is not intended for public subclassing.
*
* The primary functions are to produce a normalized string and to detect whether
* a string is already normalized.
* The most commonly used normalization forms are those defined in
* http://www.unicode.org/unicode/reports/tr15/
* However, this API supports additional normalization forms for specialized purposes.
* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
* and can be used in implementations of UTS #46.
*
* Not only are the standard compose and decompose modes supplied,
* but additional modes are provided as documented in the Mode enum.
*
* Some of the functions in this class identify normalization boundaries.
* At a normalization boundary, the portions of the string
* before it and starting from it do not interact and can be handled independently.
*
* The spanQuickCheckYes() stops at a normalization boundary.
* When the goal is a normalized string, then the text before the boundary
* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
*
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
* a character is guaranteed to be at a normalization boundary,
* regardless of context.
* This is used for moving from one normalization boundary to the next
* or preceding boundary, and for performing iterative normalization.
*
* Iterative normalization is useful when only a small portion of a
* longer string needs to be processed.
* For example, in ICU, iterative normalization is used by the NormalizationTransliterator
* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
* (to process only the substring for which sort key bytes are computed).
*
* The set of normalization boundaries returned by these functions may not be
* complete: There may be more boundaries that could be returned.
* Different functions may return different boundaries.
* @stable ICU 4.4
*/
class U_COMMON_API Normalizer2 : public UObject {
public:
/**
* Destructor.
* @stable ICU 4.4
*/
~Normalizer2();
/**
* Returns a Normalizer2 instance for Unicode NFC normalization.
* Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
static const Normalizer2 *
getNFCInstance(UErrorCode &errorCode);
/**
* Returns a Normalizer2 instance for Unicode NFD normalization.
* Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
* (The data name is "nfc" on purpose: NFD is the decompose mode of the
* "nfc" data, see getInstance().)
* Returns an unmodifiable singleton instance. Do not delete it.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
static const Normalizer2 *
getNFDInstance(UErrorCode &errorCode);
/**
* Returns a Normalizer2 instance for Unicode NFKC normalization.
* Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
static const Normalizer2 *
getNFKCInstance(UErrorCode &errorCode);
/**
* Returns a Normalizer2 instance for Unicode NFKD normalization.
* Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
* (The data name is "nfkc" on purpose: NFKD is the decompose mode of the
* "nfkc" data, see getInstance().)
* Returns an unmodifiable singleton instance. Do not delete it.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
static const Normalizer2 *
getNFKDInstance(UErrorCode &errorCode);
/**
* Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
* which is equivalent to applying the NFKC_Casefold mappings and then NFC.
* See https://www.unicode.org/reports/tr44/#NFKC_Casefold
*
* Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
static const Normalizer2 *
getNFKCCasefoldInstance(UErrorCode &errorCode);
/**
* Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
* which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
* See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
*
* Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @stable ICU 74
*/
static const Normalizer2 *
getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
/**
* Returns a Normalizer2 instance which uses the specified data file
* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
* and which composes or decomposes text according to the specified mode.
* Returns an unmodifiable singleton instance. Do not delete it.
*
* Use packageName=nullptr for data files that are part of ICU's own data.
* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
* Use name="nfkc_scf" and UNORM2_COMPOSE for the NFKC_Simple_Casefold variant
* (same instance as returned by getNFKCSimpleCasefoldInstance()).
*
* @param packageName nullptr for ICU built-in data, otherwise application data package name
* @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
* @param mode normalization mode (compose or decompose etc.)
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @stable ICU 4.4
*/
static const Normalizer2 *
getInstance(const char *packageName,
const char *name,
UNormalization2Mode mode,
UErrorCode &errorCode);
/**
* Returns the normalized form of the source string.
* Convenience overload: non-virtual, implemented inline on top of
* normalize(src, dest, errorCode).
* @param src source string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return normalized src
* @stable ICU 4.4
*/
UnicodeString
normalize(const UnicodeString &src, UErrorCode &errorCode) const {
// A fresh local serves as the destination, so src and dest are always
// different objects, as the three-argument overload requires.
UnicodeString result;
// The returned reference is ignored; the local is returned by value instead.
normalize(src, result, errorCode);
return result;
}
/**
* Writes the normalized form of the source string to the destination string
* (replacing its contents) and returns the destination string.
* The source and destination strings must be different objects.
* @param src source string
* @param dest destination string; its contents are replaced with normalized src
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return dest
* @stable ICU 4.4
*/
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const = 0;
/**
* Normalizes a UTF-8 string and optionally records how source substrings
* relate to changed and unchanged result substrings.
*
* Implemented completely for all built-in modes except for FCD.
* The base class implementation converts to & from UTF-16 and does not support edits.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src Source UTF-8 string.
* @param sink A ByteSink to which the normalized UTF-8 result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents are undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 60
*/
virtual void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const;
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if the first string was normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, will be normalized
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return first
* @stable ICU 4.4
*/
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const = 0;
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if both the strings were normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, should be normalized
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return first
* @stable ICU 4.4
*/
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const = 0;
/**
* Gets the decomposition mapping of c.
* Roughly equivalent to normalizing the String form of c
* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
* returns false and does not write a string
* if c does not have a decomposition mapping in this instance's data.
* This function is independent of the mode of the Normalizer2.
* @param c code point
* @param decomposition String object which will be set to c's
* decomposition mapping, if there is one.
* @return true if c has a decomposition, otherwise false
* @stable ICU 4.6
*/
virtual UBool
getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
/**
* Gets the raw decomposition mapping of c.
*
* This is similar to the getDecomposition() method but returns the
* raw decomposition mapping as specified in UnicodeData.txt or
* (for custom data) in the mapping files processed by the gennorm2 tool.
* By contrast, getDecomposition() returns the processed,
* recursively-decomposed version of this mapping.
*
* When used on a standard NFKC Normalizer2 instance,
* getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
*
* When used on a standard NFC Normalizer2 instance,
* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
* in this case, the result contains either one or two code points (=1..4 char16_ts).
*
* This function is independent of the mode of the Normalizer2.
* The default implementation returns false.
* @param c code point
* @param decomposition String object which will be set to c's
* raw decomposition mapping, if there is one.
* @return true if c has a decomposition, otherwise false
* @stable ICU 49
*/
virtual UBool
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
/**
* Performs pairwise composition of a & b and returns the composite if there is one.
*
* Returns a composite code point c only if c has a two-way mapping to a+b.
* In standard Unicode normalization, this means that
* c has a canonical decomposition to a+b
* and c does not have the Full_Composition_Exclusion property.
*
* This function is independent of the mode of the Normalizer2.
* The default implementation returns a negative value.
* @param a A (normalization starter) code point.
* @param b Another code point.
* @return The non-negative composite code point if there is one; otherwise a negative value.
* @stable ICU 49
*/
virtual UChar32
composePair(UChar32 a, UChar32 b) const;
/**
* Gets the combining class of c.
* The default implementation returns 0
* but all standard implementations return the Unicode Canonical_Combining_Class value.
* @param c code point
* @return c's combining class
* @stable ICU 49
*/
virtual uint8_t
getCombiningClass(UChar32 c) const;
/**
* Tests if the string is normalized.
* Internally, in cases where the quickCheck() method would return "maybe"
* (which is only possible for the two COMPOSE modes) this method
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return true if s is normalized
* @stable ICU 4.4
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
/**
* Tests if the UTF-8 string is normalized.
* Internally, in cases where the quickCheck() method would return "maybe"
* (which is only possible for the two COMPOSE modes) this method
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
*
* This works for all normalization modes.
* It is optimized for UTF-8 for all built-in modes except for FCD.
* The base class implementation converts to UTF-16 and calls isNormalized().
*
* @param s UTF-8 input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return true if s is normalized
* @stable ICU 60
*/
virtual UBool
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
/**
* Tests if the string is normalized.
* For the two COMPOSE modes, the result could be "maybe" in cases that
* would take a little more work to resolve definitively.
* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
* combination of quick check + normalization, to avoid
* re-checking the "yes" prefix.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return UNormalizationCheckResult
* @stable ICU 4.4
*/
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
/**
* Returns the end of the normalized substring of the input string.
* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
* the substring <code>UnicodeString(s, 0, end)</code>
* will pass the quick check with a "yes" result.
*
* The returned end index is usually one or more characters before the
* "no" or "maybe" character: The end index is at a normalization boundary.
* (See the class documentation for more about normalization boundaries.)
*
* When the goal is a normalized string and most input strings are expected
* to be normalized already, then call this method,
* and if it returns a prefix shorter than the input string,
* copy that prefix and use normalizeSecondAndAppend() for the remainder.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return "yes" span end index
* @stable ICU 4.4
*/
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
/**
* Tests if the character always has a normalization boundary before it,
* regardless of context.
* If true, then the character does not normalization-interact with
* preceding characters.
* In other words, a string containing this character can be normalized
* by processing portions before this character and starting from this
* character independently.
* This is used for iterative normalization. See the class documentation for details.
* @param c character to test
* @return true if c has a normalization boundary before it
* @stable ICU 4.4
*/
virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
/**
* Tests if the character always has a normalization boundary after it,
* regardless of context.
* If true, then the character does not normalization-interact with
* following characters.
* In other words, a string containing this character can be normalized
* by processing portions up to this character and after this
* character independently.
* This is used for iterative normalization. See the class documentation for details.
* Note that this operation may be significantly slower than hasBoundaryBefore().
* @param c character to test
* @return true if c has a normalization boundary after it
* @stable ICU 4.4
*/
virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
/**
* Tests if the character is normalization-inert.
* If true, then the character does not change, nor normalization-interact with
* preceding or following characters.
* In other words, a string containing this character can be normalized
* by processing portions before this character and after this
* character independently.
* This is used for iterative normalization. See the class documentation for details.
* Note that this operation may be significantly slower than hasBoundaryBefore().
* @param c character to test
* @return true if c is normalization-inert
* @stable ICU 4.4
*/
virtual UBool isInert(UChar32 c) const = 0;
};
/**
* Normalization filtered by a UnicodeSet.
* Normalizes portions of the text contained in the filter set and leaves
* portions not contained in the filter set unchanged.
* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
* This class implements all of (and only) the Normalizer2 API.
* An instance of this class is unmodifiable/immutable but is constructed and
* must be destructed by the owner.
* @stable ICU 4.4
*/
class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
public:
/**
* Constructs a filtered normalizer wrapping any Normalizer2 instance
* and a filter set.
* Both are aliased and must not be modified or deleted while this object
* is used.
* The filter set should be frozen; otherwise the performance will suffer greatly.
* @param n2 wrapped Normalizer2 instance
* @param filterSet UnicodeSet which determines the characters to be normalized
* @stable ICU 4.4
*/
// Only binds the two reference members; nothing is copied.
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
norm2(n2), set(filterSet) {}
/**
* Destructor.
* @stable ICU 4.4
*/
~FilteredNormalizer2();
/**
* Writes the normalized form of the source string to the destination string
* (replacing its contents) and returns the destination string.
* The source and destination strings must be different objects.
* @param src source string
* @param dest destination string; its contents are replaced with normalized src
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return dest
* @stable ICU 4.4
*/
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const override;
/**
* Normalizes a UTF-8 string and optionally records how source substrings
* relate to changed and unchanged result substrings.
*
* Implemented completely for most built-in modes except for FCD.
* The base class implementation converts to & from UTF-16 and does not support edits.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src Source UTF-8 string.
* @param sink A ByteSink to which the normalized UTF-8 result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents are undefined if any error occurs.
* This function calls edits->reset() first unless
* options includes U_EDITS_NO_RESET. edits can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 60
*/
virtual void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override;
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if the first string was normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, will be normalized
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return first
* @stable ICU 4.4
*/
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const override;
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if both the strings were normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, should be normalized
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return first
* @stable ICU 4.4
*/
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const override;
/**
* Gets the decomposition mapping of c.
* For details see the base class documentation.
*
* This function is independent of the mode of the Normalizer2.
* @param c code point
* @param decomposition String object which will be set to c's
* decomposition mapping, if there is one.
* @return true if c has a decomposition, otherwise false
* @stable ICU 4.6
*/
virtual UBool
getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
/**
* Gets the raw decomposition mapping of c.
* For details see the base class documentation.
*
* This function is independent of the mode of the Normalizer2.
* @param c code point
* @param decomposition String object which will be set to c's
* raw decomposition mapping, if there is one.
* @return true if c has a decomposition, otherwise false
* @stable ICU 49
*/
virtual UBool
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
/**
* Performs pairwise composition of a & b and returns the composite if there is one.
* For details see the base class documentation.
*
* This function is independent of the mode of the Normalizer2.
* @param a A (normalization starter) code point.
* @param b Another code point.
* @return The non-negative composite code point if there is one; otherwise a negative value.
* @stable ICU 49
*/
virtual UChar32
composePair(UChar32 a, UChar32 b) const override;
/**
* Gets the combining class of c.
* For details see the Normalizer2 base class documentation.
* The Normalizer2 base class default implementation returns 0
* but all standard implementations return the Unicode Canonical_Combining_Class value.
* @param c code point
* @return c's combining class
* @stable ICU 49
*/
virtual uint8_t
getCombiningClass(UChar32 c) const override;
/**
* Tests if the string is normalized.
* For details see the Normalizer2 base class documentation.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return true if s is normalized
* @stable ICU 4.4
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
* Tests if the UTF-8 string is normalized.
* Internally, in cases where the quickCheck() method would return "maybe"
* (which is only possible for the two COMPOSE modes) this method
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
*
* This works for all normalization modes.
* It is optimized for UTF-8 for all built-in modes except for FCD.
* The base class implementation converts to UTF-16 and calls isNormalized().
*
* @param s UTF-8 input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return true if s is normalized
* @stable ICU 60
*/
virtual UBool
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
/**
* Tests if the string is normalized.
* For details see the Normalizer2 base class documentation.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return UNormalizationCheckResult
* @stable ICU 4.4
*/
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
* Returns the end of the normalized substring of the input string.
* For details see the Normalizer2 base class documentation.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return "yes" span end index
* @stable ICU 4.4
*/
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
* Tests if the character always has a normalization boundary before it,
* regardless of context.
* For details see the Normalizer2 base class documentation.
* @param c character to test
* @return true if c has a normalization boundary before it
* @stable ICU 4.4
*/
virtual UBool hasBoundaryBefore(UChar32 c) const override;
/**
* Tests if the character always has a normalization boundary after it,
* regardless of context.
* For details see the Normalizer2 base class documentation.
* @param c character to test
* @return true if c has a normalization boundary after it
* @stable ICU 4.4
*/
virtual UBool hasBoundaryAfter(UChar32 c) const override;
/**
* Tests if the character is normalization-inert.
* For details see the Normalizer2 base class documentation.
* @param c character to test
* @return true if c is normalization-inert
* @stable ICU 4.4
*/
virtual UBool isInert(UChar32 c) const override;
private:
// Private, non-virtual overloads of the public functions above, each taking
// one extra control parameter. Their definitions are not in this header.
// NOTE(review): spanCondition presumably tells the implementation whether the
// text starts inside or outside the filter set — confirm in the .cpp file.
UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
USetSpanCondition spanCondition,
UErrorCode &errorCode) const;
// UTF-8 counterpart taking a pointer + length instead of a StringPiece.
void
normalizeUTF8(uint32_t options, const char *src, int32_t length,
ByteSink &sink, Edits *edits,
USetSpanCondition spanCondition,
UErrorCode &errorCode) const;
// NOTE(review): presumably the shared implementation behind the public
// normalizeSecondAndAppend() (doNormalize=true) and append()
// (doNormalize=false) — confirm in the .cpp file.
UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UBool doNormalize,
UErrorCode &errorCode) const;
// Wrapped normalizer and filter set, bound by the constructor.
// Both are references: the objects are aliased, not owned or copied,
// so they must outlive this FilteredNormalizer2.
const Normalizer2 &norm2;
const UnicodeSet &set;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_NORMALIZATION
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __NORMALIZER2_H__

View File

@@ -0,0 +1,816 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
********************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2015, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*/
#ifndef NORMLZR_H
#define NORMLZR_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C++ API: Unicode Normalization
*/
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/chariter.h"
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
/**
* Old Unicode normalization API.
*
* This API has been replaced by the Normalizer2 class and is only available
* for backward compatibility. This class simply delegates to the Normalizer2 class.
* There is one exception: The new API does not provide a replacement for Normalizer::compare().
*
* The Normalizer class supports the standard normalization forms described in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Standard Annex #15: Unicode Normalization Forms</a>.
*
* The Normalizer class consists of two parts:
* - static functions that normalize strings or test if strings are normalized
* - a Normalizer object is an iterator that takes any kind of text and
* provides iteration over its normalized form
*
* The Normalizer class is not suitable for subclassing.
*
* For basic information about normalization forms and details about the C API
* please see the documentation in unorm.h.
*
* The iterator API with the Normalizer constructors and the non-static functions
* use a CharacterIterator as input. It is possible to pass a string which
* is then internally wrapped in a CharacterIterator.
* The input text is not normalized all at once, but incrementally where needed
* (providing efficient random access).
* This makes it possible to pass in a large text but spend only a small
* amount of time normalizing a small part of that text.
* However, if the entire text is normalized, then the iterator will be
* slower than normalizing the entire text at once and iterating over the result.
* A possible use of the Normalizer iterator is also to report an index into the
* original text that is close to where the normalized characters come from.
*
* <em>Important:</em> The iterator API was cleaned up significantly for ICU 2.0.
* The earlier implementation reported the getIndex() inconsistently,
* and previous() could not be used after setIndex(), next(), first(), and current().
*
* Normalizer allows normalization to start from anywhere in the input text by
* calling setIndexOnly(), first(), or last().
* Without calling any of these, the iterator will start at the beginning of the text.
*
* At any time, next() returns the next normalized code point (UChar32),
* with post-increment semantics (like CharacterIterator::next32PostInc()).
* previous() returns the previous normalized code point (UChar32),
* with pre-decrement semantics (like CharacterIterator::previous32()).
*
* current() returns the current code point
* (respectively the one at the newly set index) without moving
* the getIndex(). Note that if the text at the current position
* needs to be normalized, then these functions will do that.
* (This is why current() is not const.)
* It is more efficient to call setIndexOnly() instead, which does not
* normalize.
*
* getIndex() always refers to the position in the input text where the normalized
* code points are returned from. It does not always change with each returned
* code point.
* The code point that is returned from any of the functions
* corresponds to text at or after getIndex(), according to the
* function's iteration semantics (post-increment or pre-decrement).
*
* next() returns a code point from at or after the getIndex()
* from before the next() call. After the next() call, the getIndex()
* might have moved to where the next code point will be returned from
* (from a next() or current() call).
* This is semantically equivalent to array access with array[index++]
* (post-increment semantics).
*
* previous() returns a code point from at or after the getIndex()
* from after the previous() call.
* This is semantically equivalent to array access with array[--index]
* (pre-decrement semantics).
*
* Internally, the Normalizer iterator normalizes a small piece of text
* starting at the getIndex() and ending at a following "safe" index.
* The normalized result is stored in an internal string buffer, and
* the code points are iterated from there.
* With multiple iteration calls, this is repeated until the next piece
* of text needs to be normalized, and the getIndex() needs to be moved.
*
* The following "safe" index, the internal buffer, and the secondary
* iteration index into that buffer are not exposed on the API.
* This also means that it is currently not practical to return to
* a particular, arbitrary position in the text because one would need to
* know, and be able to set, in addition to the getIndex(), at least also the
* current index into the internal buffer.
* It is currently only possible to observe when getIndex() changes
* (with careful consideration of the iteration semantics),
* at which time the internal index will be 0.
* For example, if getIndex() is different after next() than before it,
* then the internal index is 0 and one can return to this getIndex()
* later with setIndexOnly().
*
* Note: While the setIndex() and getIndex() refer to indices in the
* underlying Unicode input text, the next() and previous() methods
* iterate through characters in the normalized output.
* This means that there is not necessarily a one-to-one correspondence
* between characters returned by next() and previous() and the indices
* passed to and returned from setIndex() and getIndex().
* It is for this reason that Normalizer does not implement the CharacterIterator interface.
*
* @author Laura Werner, Mark Davis, Markus Scherer
* @stable ICU 2.0
*/
class U_COMMON_API Normalizer : public UObject {
public:
#ifndef U_HIDE_DEPRECATED_API
/**
* If DONE is returned from an iteration function that returns a code point,
* then there are no more normalization results available.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
enum {
DONE=0xffff
};
// Constructors
/**
* Creates a new <code>Normalizer</code> object for iterating over the
* normalized form of a given string.
* <p>
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
Normalizer(const UnicodeString& str, UNormalizationMode mode);
/**
* Creates a new <code>Normalizer</code> object for iterating over the
* normalized form of a given string.
* <p>
* @param str The string to be normalized. The normalization
* will start at the beginning of the string.
*
* @param length Length of the string, or -1 if NUL-terminated.
* @param mode The normalization mode.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode);
/**
* Creates a new <code>Normalizer</code> object for iterating over the
* normalized form of the given text.
* <p>
* @param iter The input text to be normalized. The normalization
* will start at the beginning of the string.
*
* @param mode The normalization mode.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
#endif /* U_HIDE_DEPRECATED_API */
#ifndef U_FORCE_HIDE_DEPRECATED_API
/**
* Copy constructor.
* @param copy The object to be copied.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
Normalizer(const Normalizer& copy);
/**
* Destructor
* @deprecated ICU 56 Use Normalizer2 instead.
*/
virtual ~Normalizer();
#endif // U_FORCE_HIDE_DEPRECATED_API
//-------------------------------------------------------------------------
// Static utility methods
//-------------------------------------------------------------------------
#ifndef U_HIDE_DEPRECATED_API
/**
* Normalizes a <code>UnicodeString</code> according to the specified normalization mode.
* This is a wrapper for unorm_normalize(), using UnicodeString's.
*
* The <code>options</code> parameter specifies which optional
* <code>Normalizer</code> features are to be enabled for this operation.
*
* @param source the input string to be normalized.
* @param mode the normalization mode
* @param options the optional features to be enabled (0 for no options)
* @param result The normalized string (on output).
* @param status The error code.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static void U_EXPORT2 normalize(const UnicodeString& source,
UNormalizationMode mode, int32_t options,
UnicodeString& result,
UErrorCode &status);
/**
* Compose a <code>UnicodeString</code>.
* This is equivalent to normalize() with mode UNORM_NFC or UNORM_NFKC.
* This is a wrapper for unorm_normalize(), using UnicodeString's.
*
* The <code>options</code> parameter specifies which optional
* <code>Normalizer</code> features are to be enabled for this operation.
*
* @param source the string to be composed.
* @param compat Perform compatibility decomposition before composition.
* If this argument is <code>false</code>, only canonical
* decomposition will be performed.
* @param options the optional features to be enabled (0 for no options)
* @param result The composed string (on output).
* @param status The error code.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static void U_EXPORT2 compose(const UnicodeString& source,
UBool compat, int32_t options,
UnicodeString& result,
UErrorCode &status);
/**
* Static method to decompose a <code>UnicodeString</code>.
* This is equivalent to normalize() with mode UNORM_NFD or UNORM_NFKD.
* This is a wrapper for unorm_normalize(), using UnicodeString's.
*
* The <code>options</code> parameter specifies which optional
* <code>Normalizer</code> features are to be enabled for this operation.
*
* @param source the string to be decomposed.
* @param compat Perform compatibility decomposition.
* If this argument is <code>false</code>, only canonical
* decomposition will be performed.
* @param options the optional features to be enabled (0 for no options)
* @param result The decomposed string (on output).
* @param status The error code.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static void U_EXPORT2 decompose(const UnicodeString& source,
UBool compat, int32_t options,
UnicodeString& result,
UErrorCode &status);
/**
* Performs a quick check on a string, to quickly determine if the string is
* in a particular normalization format.
* This is a wrapper for unorm_quickCheck(), using a UnicodeString.
*
* Three types of result can be returned: UNORM_YES, UNORM_NO or
* UNORM_MAYBE. Result UNORM_YES indicates that the argument
* string is in the desired normalized format, UNORM_NO determines that
* argument string is not in the desired normalized format. A
* UNORM_MAYBE result indicates that a more thorough check is required,
* the user may have to put the string in its normalized form and compare the
* results.
*
* This overload is implemented inline as quickCheck(source, mode, 0, status),
* i.e. with no options set.
*
* @param source string for determining if it is in a normalized format
* @param mode normalization format
* @param status A reference to a UErrorCode to receive any errors
* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
*
* @see isNormalized
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static inline UNormalizationCheckResult
quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
/**
* Performs a quick check on a string; same as the other version of quickCheck
* but takes an extra options parameter like most normalization functions.
*
* @param source string for determining if it is in a normalized format
* @param mode normalization format
* @param options the optional features to be enabled (0 for no options)
* @param status A reference to a UErrorCode to receive any errors
* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
*
* @see isNormalized
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static UNormalizationCheckResult
quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
/**
* Test if a string is in a given normalization form.
* This is semantically equivalent to source.equals(normalize(source, mode)) .
*
* Unlike unorm_quickCheck(), this function returns a definitive result,
* never a "maybe".
* For NFD, NFKD, and FCD, both functions work exactly the same.
* For NFC and NFKC where quickCheck may return "maybe", this function will
* perform further tests to arrive at a true/false result.
*
* This overload is implemented inline as isNormalized(src, mode, 0, errorCode),
* i.e. with no options set.
*
* @param src String that is to be tested if it is in a normalization format.
* @param mode Which normalization form to test for.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Boolean value indicating whether the source string is in the
* "mode" normalization form.
*
* @see quickCheck
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static inline UBool
isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
/**
* Test if a string is in a given normalization form; same as the other version of isNormalized
* but takes an extra options parameter like most normalization functions.
*
* @param src String that is to be tested if it is in a normalization format.
* @param mode Which normalization form to test for.
* @param options the optional features to be enabled (0 for no options)
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Boolean value indicating whether the source string is in the
* "mode" normalization form.
*
* @see quickCheck
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static UBool
isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
/**
* Concatenate normalized strings, making sure that the result is normalized as well.
*
* If both the left and the right strings are in
* the normalization form according to "mode/options",
* then the result will be
*
* \code
* dest=normalize(left+right, mode, options)
* \endcode
*
* For details see unorm_concatenate in unorm.h.
*
* @param left Left source string.
* @param right Right source string.
* @param result The output string.
* @param mode The normalization mode.
* @param options A bit set of normalization options.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return result
*
* @see unorm_concatenate
* @see normalize
* @see unorm_next
* @see unorm_previous
*
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static UnicodeString &
U_EXPORT2 concatenate(const UnicodeString &left, const UnicodeString &right,
UnicodeString &result,
UNormalizationMode mode, int32_t options,
UErrorCode &errorCode);
#endif /* U_HIDE_DEPRECATED_API */
/**
* Compare two strings for canonical equivalence.
* Further options include case-insensitive comparison and
* code point order (as opposed to code unit order).
* This is an inline wrapper for unorm_compare(), using UnicodeString's;
* all argument checking is done in unorm_compare().
*
* Canonical equivalence between two strings is defined as their normalized
* forms (NFD or NFC) being identical.
* This function compares strings incrementally instead of normalizing
* (and optionally case-folding) both strings entirely,
* improving performance significantly.
*
* Bulk normalization is only necessary if the strings do not fulfill the FCD
* conditions. Only in this case, and only if the strings are relatively long,
* is memory allocated temporarily.
* For FCD strings and short non-FCD strings there is no memory allocation.
*
* Semantically, this is equivalent to
* strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
* where code point order and foldCase are all optional.
*
* UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
* the case folding must be performed first, then the normalization.
*
* @param s1 First source string.
* @param s2 Second source string.
*
* @param options A bit set of options:
* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
* Case-sensitive comparison in code unit order, and the input strings
* are quick-checked for FCD.
*
* - UNORM_INPUT_IS_FCD
* Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
* If not set, the function will quickCheck for FCD
* and normalize if necessary.
*
* - U_COMPARE_CODE_POINT_ORDER
* Set to choose code point order instead of code unit order
* (see u_strCompare for details).
*
* - U_COMPARE_IGNORE_CASE
* Set to compare strings case-insensitively using case folding,
* instead of case-sensitively.
* If set, then the following case folding options are used.
*
* - Options as used with case-insensitive comparisons, currently:
*
* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
* (see u_strCaseCompare for details)
*
* - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
*
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return <0 or 0 or >0 as usual for string comparisons
*
* @see unorm_compare
* @see normalize
* @see UNORM_FCD
* @see u_strCompare
* @see u_strCaseCompare
*
* @stable ICU 2.2
*/
static inline int32_t
compare(const UnicodeString &s1, const UnicodeString &s2,
uint32_t options,
UErrorCode &errorCode);
#ifndef U_HIDE_DEPRECATED_API
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
/**
* Return the current character in the normalized text.
* current() may need to normalize some text at getIndex().
* The getIndex() is not changed.
*
* @return the current normalized code point
* @deprecated ICU 56 Use Normalizer2 instead.
*/
UChar32 current();
/**
* Return the first character in the normalized text.
* This is equivalent to setIndexOnly(startIndex()) followed by next().
* (Post-increment semantics.)
*
* @return the first normalized code point
* @deprecated ICU 56 Use Normalizer2 instead.
*/
UChar32 first();
/**
* Return the last character in the normalized text.
* This is equivalent to setIndexOnly(endIndex()) followed by previous().
* (Pre-decrement semantics.)
*
* @return the last normalized code point
* @deprecated ICU 56 Use Normalizer2 instead.
*/
UChar32 last();
/**
* Return the next character in the normalized text.
* (Post-increment semantics.)
* If the end of the text has already been reached, DONE is returned.
* The DONE value could be confused with a U+FFFF non-character code point
* in the text. If this is possible, you can test getIndex()<endIndex()
* before calling next(), or (getIndex()<endIndex() || last()!=DONE)
* after calling next(). (Calling last() will change the iterator state!)
*
* The C API unorm_next() is more efficient and does not have this ambiguity.
*
* @return the next normalized code point
* @deprecated ICU 56 Use Normalizer2 instead.
*/
UChar32 next();
/**
* Return the previous character in the normalized text and decrement.
* (Pre-decrement semantics.)
* If the beginning of the text has already been reached, DONE is returned.
* The DONE value could be confused with a U+FFFF non-character code point
* in the text. If this is possible, you can test
* (getIndex()>startIndex() || first()!=DONE). (Calling first() will change
* the iterator state!)
*
* The C API unorm_previous() is more efficient and does not have this ambiguity.
*
* @return the previous normalized code point
* @deprecated ICU 56 Use Normalizer2 instead.
*/
UChar32 previous();
/**
* Set the iteration position in the input text that is being normalized,
* without any immediate normalization.
* After setIndexOnly(), getIndex() will return the same index that is
* specified here.
*
* @param index the desired index in the input text.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
void setIndexOnly(int32_t index);
/**
* Reset the index to the beginning of the text.
* This is equivalent to setIndexOnly(startIndex()).
* @deprecated ICU 56 Use Normalizer2 instead.
*/
void reset();
/**
* Retrieve the current iteration position in the input text that is
* being normalized.
*
* A following call to next() will return a normalized code point from
* the input text at or after this index.
*
* After a call to previous(), getIndex() will point at or before the
* position in the input text where the normalized code point
* was returned from with previous().
*
* @return the current index in the input text
* @deprecated ICU 56 Use Normalizer2 instead.
*/
int32_t getIndex() const;
/**
* Retrieve the index of the start of the input text. This is the begin index
* of the <code>CharacterIterator</code> or the start (i.e. index 0) of the string
* over which this <code>Normalizer</code> is iterating.
*
* @return the smallest index in the input text where the Normalizer operates
* @deprecated ICU 56 Use Normalizer2 instead.
*/
int32_t startIndex() const;
/**
* Retrieve the index of the end of the input text. This is the end index
* of the <code>CharacterIterator</code> or the length of the string
* over which this <code>Normalizer</code> is iterating.
* This end index is exclusive, i.e., the Normalizer operates only on characters
* before this index.
*
* @return the first index in the input text where the Normalizer does not operate
* @deprecated ICU 56 Use Normalizer2 instead.
*/
int32_t endIndex() const;
/**
* Returns true when both iterators refer to the same character in the same
* input text.
*
* @param that a Normalizer object to compare this one to
* @return comparison result
* @deprecated ICU 56 Use Normalizer2 instead.
*/
bool operator==(const Normalizer& that) const;
/**
* Returns false when both iterators refer to the same character in the same
* input text. Implemented inline as the negation of operator==.
*
* @param that a Normalizer object to compare this one to
* @return comparison result
* @deprecated ICU 56 Use Normalizer2 instead.
*/
inline bool operator!=(const Normalizer& that) const;
/**
* Returns a pointer to a new Normalizer that is a clone of this one.
* The caller is responsible for deleting the new clone.
* @return a pointer to a new Normalizer
* @deprecated ICU 56 Use Normalizer2 instead.
*/
Normalizer* clone() const;
/**
* Generates a hash code for this iterator.
*
* @return the hash code
* @deprecated ICU 56 Use Normalizer2 instead.
*/
int32_t hashCode() const;
//-------------------------------------------------------------------------
// Property access methods
//-------------------------------------------------------------------------
/**
* Set the normalization mode for this object.
* <p>
* <b>Note:</b>If the normalization mode is changed while iterating
* over a string, calls to {@link #next() } and {@link #previous() } may
* return previously buffered characters in the old normalization mode
* until the iteration is able to re-sync at the next base character.
* It is safest to call {@link #setIndexOnly }, {@link #reset() },
* {@link #setText }, {@link #first() },
* {@link #last() }, etc. after calling <code>setMode</code>.
* <p>
* @param newMode the new mode for this <code>Normalizer</code>.
* @see #getUMode
* @deprecated ICU 56 Use Normalizer2 instead.
*/
void setMode(UNormalizationMode newMode);
/**
* Return the normalization mode for this object.
*
* This is an unusual name because there used to be a getMode() that
* returned a different type.
*
* @return the mode for this <code>Normalizer</code>
* @see #setMode
* @deprecated ICU 56 Use Normalizer2 instead.
*/
UNormalizationMode getUMode() const;
/**
* Set options that affect this <code>Normalizer</code>'s operation.
* Options do not change the basic composition or decomposition operation
* that is being performed, but they control whether
* certain optional portions of the operation are done.
* Currently the only available option is obsolete.
*
* It is possible to specify multiple options that are all turned on or off.
*
* @param option the option(s) whose value is/are to be set.
* @param value the new setting for the option. Use <code>true</code> to
* turn the option(s) on and <code>false</code> to turn it/them off.
*
* @see #getOption
* @deprecated ICU 56 Use Normalizer2 instead.
*/
void setOption(int32_t option,
UBool value);
/**
* Determine whether an option is turned on or off.
* If multiple options are specified, then the result is true if any
* of them are set.
* <p>
* @param option the option(s) that are to be checked
* @return true if any of the option(s) are set
* @see #setOption
* @deprecated ICU 56 Use Normalizer2 instead.
*/
UBool getOption(int32_t option) const;
/**
* Set the input text over which this <code>Normalizer</code> will iterate.
* The iteration position is set to the beginning.
*
* @param newText a string that replaces the current input text
* @param status a UErrorCode
* @deprecated ICU 56 Use Normalizer2 instead.
*/
void setText(const UnicodeString& newText,
UErrorCode &status);
/**
* Set the input text over which this <code>Normalizer</code> will iterate.
* The iteration position is set to the beginning.
*
* @param newText a CharacterIterator object that replaces the current input text
* @param status a UErrorCode
* @deprecated ICU 56 Use Normalizer2 instead.
*/
void setText(const CharacterIterator& newText,
UErrorCode &status);
/**
* Set the input text over which this <code>Normalizer</code> will iterate.
* The iteration position is set to the beginning.
*
* @param newText a string that replaces the current input text
* @param length the length of the string, or -1 if NUL-terminated
* @param status a UErrorCode
* @deprecated ICU 56 Use Normalizer2 instead.
*/
void setText(ConstChar16Ptr newText,
int32_t length,
UErrorCode &status);
/**
* Copies the input text into the UnicodeString argument.
*
* @param result Receives a copy of the text under iteration.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
void getText(UnicodeString& result);
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
* @return a UClassID for this class.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
static UClassID U_EXPORT2 getStaticClassID();
#endif /* U_HIDE_DEPRECATED_API */
#ifndef U_FORCE_HIDE_DEPRECATED_API
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
* @return a UClassID for the actual class.
* @deprecated ICU 56 Use Normalizer2 instead.
*/
virtual UClassID getDynamicClassID() const override;
#endif // U_FORCE_HIDE_DEPRECATED_API
private:
//-------------------------------------------------------------------------
// Private functions
//-------------------------------------------------------------------------
Normalizer() = delete; // default constructor is deleted (not available)
Normalizer &operator=(const Normalizer &that) = delete; // assignment operator is deleted (not available)
// Private utility methods for iteration
// For documentation, see the source code
UBool nextNormalize();
UBool previousNormalize();
void init();
void clearBuffer();
//-------------------------------------------------------------------------
// Private data
//-------------------------------------------------------------------------
FilteredNormalizer2*fFilteredNorm2; // owned if not nullptr
const Normalizer2 *fNorm2; // not owned; may be equal to fFilteredNorm2
UNormalizationMode fUMode; // deprecated
int32_t fOptions;
// The input text and our position in it
CharacterIterator *text;
// The normalization buffer is the result of normalization
// of the source in [currentIndex..nextIndex[ .
int32_t currentIndex, nextIndex;
// A buffer for holding intermediate results
UnicodeString buffer;
// NOTE(review): presumably the iteration position within buffer -- confirm in the implementation
int32_t bufferPos;
};
//-------------------------------------------------------------------------
// Inline implementations
//-------------------------------------------------------------------------
#ifndef U_HIDE_DEPRECATED_API
// Inequality is defined as the exact negation of operator==.
inline bool
Normalizer::operator!=(const Normalizer& other) const {
    const bool sameState = operator==(other);
    return !sameState;
}
// Convenience overload: forwards to the four-argument quickCheck()
// with the options bit set fixed to 0 (no options).
inline UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
                       UNormalizationMode mode,
                       UErrorCode &status) {
    const int32_t noOptions = 0;
    return quickCheck(source, mode, noOptions, status);
}
// Convenience overload: forwards to the four-argument isNormalized()
// with the options bit set fixed to 0 (no options).
inline UBool
Normalizer::isNormalized(const UnicodeString& source,
                         UNormalizationMode mode,
                         UErrorCode &status) {
    const int32_t noOptions = 0;
    return isNormalized(source, mode, noOptions, status);
}
#endif /* U_HIDE_DEPRECATED_API */
// Thin UnicodeString wrapper around the C function unorm_compare():
// passes each string's buffer and length through unchanged.
inline int32_t
Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
                    uint32_t options,
                    UErrorCode &errorCode) {
    // Argument validation is left entirely to unorm_compare.
    const auto *leftText = toUCharPtr(s1.getBuffer());
    const auto *rightText = toUCharPtr(s2.getBuffer());
    return unorm_compare(leftText, s1.length(),
                         rightText, s2.length(),
                         options,
                         &errorCode);
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_NORMALIZATION */
#endif // NORMLZR_H
#endif /* U_SHOW_CPLUSPLUS_API */

View File

@@ -0,0 +1,94 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 03/14/00 aliu Creation.
* 06/27/00 aliu Change from C++ class to C struct
**********************************************************************
*/
#ifndef PARSEERR_H
#define PARSEERR_H
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Parse Error Information
*/
/**
* The capacity of the context strings in UParseError, counted in UChar
* code units. The context strings are null-terminated, so the
* terminator is included in this capacity.
* @stable ICU 2.0
*/
enum { U_PARSE_CONTEXT_LEN = 16 };
/**
* A UParseError struct is used to return detailed information about
* parsing errors. It is used by ICU parsing engines that parse long
* rules, patterns, or programs, where the text being parsed is long
* enough that more information than a UErrorCode is needed to
* localize the error.
*
* <p>The line, offset, and context fields are optional; parsing
* engines may choose not to use them.
*
* <p>The preContext and postContext strings include some part of the
* context surrounding the error. If the source text is "let for=7"
* and "for" is the error (e.g., because it is a reserved word), then
* some examples of what a parser might produce are the following:
*
* <pre>
* preContext postContext
* "" "" The parser does not support context
* "let " "=7" Pre- and post-context only
* "let " "for=7" Pre- and post-context and error text
* "" "for" Error text only
* </pre>
*
* <p>Examples of engines which use UParseError (or may use it in the
* future) are Transliterator, RuleBasedBreakIterator, and
* RegexPattern.
*
* @stable ICU 2.0
*/
typedef struct UParseError {
/**
* The line on which the error occurred. If the parser uses this
* field, it sets it to the line number of the source text line on
* which the error appears, which will be a value >= 1. If the
* parse does not support line numbers, the value will be <= 0.
* @stable ICU 2.0
*/
int32_t line;
/**
* The character offset to the error. If the line field is >= 1,
* then this is the offset from the start of the line. Otherwise,
* this is the offset from the start of the text. If the parser
* does not support this field, it will have a value < 0.
* @stable ICU 2.0
*/
int32_t offset;
/**
* Textual context before the error. Null-terminated. The empty
* string if not supported by parser.
* The array holds U_PARSE_CONTEXT_LEN code units including the terminator.
* @stable ICU 2.0
*/
UChar preContext[U_PARSE_CONTEXT_LEN];
/**
* The error itself and/or textual context after the error.
* Null-terminated. The empty string if not supported by parser.
* The array holds U_PARSE_CONTEXT_LEN code units including the terminator.
* @stable ICU 2.0
*/
UChar postContext[U_PARSE_CONTEXT_LEN];
} UParseError;
#endif

View File

@@ -0,0 +1,237 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
* Copyright (C) 1997-2005, International Business Machines Corporation and others. All Rights Reserved.
*******************************************************************************
*
* File PARSEPOS.H
*
* Modification History:
*
* Date Name Description
* 07/09/97 helena Converted from java.
* 07/17/98 stephen Added errorIndex support.
* 05/11/99 stephen Cleaned up.
*******************************************************************************
*/
#ifndef PARSEPOS_H
#define PARSEPOS_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
/**
* \file
* \brief C++ API: ParsePosition class: tracks the current position during parsing.
*/
/**
* <code>ParsePosition</code> is a simple class used by <code>Format</code>
* and its subclasses to keep track of the current position during parsing.
* The <code>parseObject</code> method in the various <code>Format</code>
* classes requires a <code>ParsePosition</code> object as an argument.
*
* <p>
* By design, as you parse through a string with different formats,
* you can use the same <code>ParsePosition</code>, since the index parameter
* records the current position.
*
* The ParsePosition class is not suitable for subclassing.
*
* @version 1.3 10/30/97
* @author Mark Davis, Helena Shih
* @see java.text.Format
*/
class U_COMMON_API ParsePosition : public UObject {
public:
/**
* Default constructor, the index starts with 0 as default.
* The error index starts as -1 (not set).
* @stable ICU 2.0
*/
ParsePosition()
: UObject(),
index(0),
errorIndex(-1)
{}
/**
* Create a new ParsePosition with the given initial index.
* The error index starts as -1 (not set).
* @param newIndex the new text offset.
* @stable ICU 2.0
*/
ParsePosition(int32_t newIndex)
: UObject(),
index(newIndex),
errorIndex(-1)
{}
/**
* Copy constructor
* @param copy the object to be copied from.
* @stable ICU 2.0
*/
ParsePosition(const ParsePosition& copy)
: UObject(copy),
index(copy.index),
errorIndex(copy.errorIndex)
{}
/**
* Destructor
* @stable ICU 2.0
*/
virtual ~ParsePosition();
/**
* Assignment operator.
* Copies both the index and the error index.
* @param copy the object to be copied from.
* @return a reference to this object.
* @stable ICU 2.0
*/
inline ParsePosition& operator=(const ParsePosition& copy);
/**
* Equality operator.
* Two parse positions are equal when both their index and their
* error index are equal.
* @param that the object to compare this one to.
* @return true if the two parse positions are equal, false otherwise.
* @stable ICU 2.0
*/
inline bool operator==(const ParsePosition& that) const;
/**
* Inequality operator.
* @param that the object to compare this one to.
* @return true if the two parse positions are not equal, false otherwise.
* @stable ICU 2.0
*/
inline bool operator!=(const ParsePosition& that) const;
/**
* Clone this object.
* Clones can be used concurrently in multiple threads.
* If an error occurs, then nullptr is returned.
* The caller must delete the clone.
*
* @return a clone of this object
*
* @see getDynamicClassID
* @stable ICU 2.8
*/
ParsePosition *clone() const;
/**
* Retrieve the current parse position. On input to a parse method, this
* is the index of the character at which parsing will begin; on output, it
* is the index of the character following the last character parsed.
* @return the current index.
* @stable ICU 2.0
*/
inline int32_t getIndex() const;
/**
* Set the current parse position.
* @param index the new index.
* @stable ICU 2.0
*/
inline void setIndex(int32_t index);
/**
* Set the index at which a parse error occurred. Formatters
* should set this before returning an error code from their
* parseObject method. The default value is -1 if this is not
* set.
* @param ei the index at which the error occurred.
* @stable ICU 2.0
*/
inline void setErrorIndex(int32_t ei);
/**
* Retrieve the index at which an error occurred, or -1 if the
* error index has not been set.
* @return the error index, or -1 if it has not been set.
* @stable ICU 2.0
*/
inline int32_t getErrorIndex() const;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @return a UClassID for this class.
* @stable ICU 2.2
*/
static UClassID U_EXPORT2 getStaticClassID();
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @return a UClassID for the actual class.
* @stable ICU 2.2
*/
virtual UClassID getDynamicClassID() const override;
private:
/**
* Input: the place you start parsing.
* <br>Output: position where the parse stopped.
* This is designed to be used serially,
* with each call setting index up for the next one.
*/
int32_t index;
/**
* The index at which a parse error occurred.
* Initialized to -1 by the constructors that do not copy another object.
*/
int32_t errorIndex;
};
// Copies the parse index and the error index from the source object.
inline ParsePosition&
ParsePosition::operator=(const ParsePosition& copy) {
    this->errorIndex = copy.errorIndex;
    this->index = copy.index;
    return *this;
}
// Two positions are equal only if both the index and the error index match.
inline bool
ParsePosition::operator==(const ParsePosition& copy) const {
    return index == copy.index && errorIndex == copy.errorIndex;
}
// Inequality is the exact negation of operator==.
inline bool
ParsePosition::operator!=(const ParsePosition& copy) const {
    const bool same = operator==(copy);
    return !same;
}
// Returns the stored parse index.
inline int32_t
ParsePosition::getIndex() const {
    return this->index;
}
// Stores the given offset as the parse index.
inline void
ParsePosition::setIndex(int32_t offset) {
    index = offset;
}
inline int32_t
ParsePosition::getErrorIndex() const
{
return errorIndex;
}
inline void
ParsePosition::setErrorIndex(int32_t ei)
{
this->errorIndex = ei;
}
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,861 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* FILE NAME : platform.h
*
* Date Name Description
* 05/13/98 nos Creation (content moved here from ptypes.h).
* 03/02/99 stephen Added AS400 support.
* 03/30/99 stephen Added Linux support.
* 04/13/99 stephen Reworked for autoconf.
******************************************************************************
*/
#ifndef _PLATFORM_H
#define _PLATFORM_H
#include "unicode/uconfig.h"
#include "unicode/uvernum.h"
/**
* \file
* \brief Basic types for the platform.
*
* This file used to be generated by autoconf/configure.
* Starting with ICU 49, platform.h is a normal source file,
* to simplify cross-compiling and working with non-autoconf/make build systems.
*
* When a value in this file does not work on a platform, then please
* try to derive it from the U_PLATFORM value
* (for which we might need a new value constant in rare cases)
* and/or from other macros that are predefined by the compiler
* or defined in standard (POSIX or platform or compiler) headers.
*
* As a temporary workaround, you can add an explicit \#define for some macros
* before it is first tested, or add an equivalent -D macro definition
* to the compiler's command line.
*
* Note: Some compilers provide ways to show the predefined macros.
* For example, with gcc you can compile an empty .c file and have the compiler
* print the predefined macros with
* \code
* gcc -E -dM -x c /dev/null | sort
* \endcode
* (You can provide an actual empty .c file rather than /dev/null.
* <code>-x c++</code> is for C++.)
*/
/**
* Define some things so that they can be documented.
* @internal
*/
#ifdef U_IN_DOXYGEN
/*
* Problem: "platform.h:335: warning: documentation for unknown define U_HAVE_STD_STRING found." means that U_HAVE_STD_STRING is not documented.
* Solution: #define any defines for non @internal API here, so that they are visible in the docs. If you just set PREDEFINED in Doxyfile.in, they won't be documented.
*/
/* None for now. */
#endif
/**
* \def U_PLATFORM
* The U_PLATFORM macro defines the platform we're on.
*
* We used to define one different, value-less macro per platform.
* That made it hard to know the set of relevant platforms and macros,
* and hard to deal with variants of platforms.
*
* Starting with ICU 49, we define platforms as numeric macros,
* with ranges of values for related platforms and their variants.
* The U_PLATFORM macro is set to one of these values.
*
* Historical note from the Solaris Wikipedia article:
* AT&T and Sun collaborated on a project to merge the most popular Unix variants
* on the market at that time: BSD, System V, and Xenix.
* This became Unix System V Release 4 (SVR4).
*
* @internal
*/
/** The platform could not be identified. @internal */
#define U_PF_UNKNOWN                0

/** Windows @internal */
#define U_PF_WINDOWS                1000
/** MinGW: Windows with calls to the Win32 API, built with GNU gcc and binutils. @internal */
#define U_PF_MINGW                  1800
/**
 * Cygwin: Windows where POSIX functions are provided through cygwin1.dll,
 * built with MSVC or GNU gcc and binutils.
 * @internal
 */
#define U_PF_CYGWIN                 1900

/* Reserve 2000 for U_PF_UNIX? */
/** HP-UX, which is based on UNIX System V. @internal */
#define U_PF_HPUX                   2100
/** Solaris, a Unix operating system based on SVR4. @internal */
#define U_PF_SOLARIS                2600
/** BSD, a UNIX operating system derivative. @internal */
#define U_PF_BSD                    3000
/** AIX, which is based on UNIX System V Releases and 4.3 BSD. @internal */
#define U_PF_AIX                    3100
/** IRIX, which is based on UNIX System V with BSD extensions. @internal */
#define U_PF_IRIX                   3200
/**
 * Darwin is a POSIX-compliant operating system made up of code developed by Apple
 * together with code derived from NeXTSTEP, BSD, and other projects,
 * built around the Mach kernel.
 * Darwin is the core set of components on which Mac OS X, Apple TV, and iOS are based.
 * (Original description modified from Wikipedia.)
 * @internal
 */
#define U_PF_DARWIN                 3500
/** iPhone OS (iOS), a derivative of Mac OS X. @internal */
#define U_PF_IPHONE                 3550
/** QNX, a commercial Unix-like real-time operating system related to BSD. @internal */
#define U_PF_QNX                    3700

/** Linux, a Unix-like operating system. @internal */
#define U_PF_LINUX                  4000
/**
 * Native Client, which is pretty close to Linux.
 * See https://developer.chrome.com/native-client and
 * http://www.chromium.org/nativeclient
 * @internal
 */
#define U_PF_BROWSER_NATIVE_CLIENT  4020
/** Android, which is based on Linux. @internal */
#define U_PF_ANDROID                4050
/** Haiku, a POSIX-ish platform. @internal */
#define U_PF_HAIKU                  4080
/** Fuchsia, a POSIX-ish platform. @internal */
#define U_PF_FUCHSIA                4100
/* Maximum value for Linux-based platform is 4499 */

/**
 * Emscripten, a C++ transpiler for the Web that can target asm.js or
 * WebAssembly. It supplies some POSIX-compatible wrappers and stubs and
 * some Linux-like functionality, but is not fully compatible with
 * either.
 * @internal
 */
#define U_PF_EMSCRIPTEN             5010

/** z/OS, the successor to OS/390, which in turn succeeded MVS. @internal */
#define U_PF_OS390                  9000
/** "IBM i", the current name of what used to be i5/OS and, earlier, OS/400. @internal */
#define U_PF_OS400                  9400
/*
 * Select the U_PLATFORM value.
 * A value that is already defined (for example on the compiler command line)
 * is kept unchanged. Otherwise the tests below run in order and the first
 * match wins, which is why MinGW and Cygwin are tested before generic Windows,
 * and Android, Native Client and Fuchsia before generic Linux.
 * Some branches also pull in a platform header that later tests in this file
 * rely on.
 */
#ifdef U_PLATFORM
    /* Use the predefined value. */
#elif defined(__MINGW32__)
#   define U_PLATFORM U_PF_MINGW
#elif defined(__CYGWIN__)
#   define U_PLATFORM U_PF_CYGWIN
    /* Cygwin uchar.h doesn't exist until Cygwin 3.5. */
#   include <cygwin/version.h>
#elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#   define U_PLATFORM U_PF_WINDOWS
#elif defined(__ANDROID__)
#   define U_PLATFORM U_PF_ANDROID
    /* Android wchar_t support depends on the API level. */
#   include <android/api-level.h>
#elif defined(__pnacl__) || defined(__native_client__)
#   define U_PLATFORM U_PF_BROWSER_NATIVE_CLIENT
#elif defined(__Fuchsia__)
#   define U_PLATFORM U_PF_FUCHSIA
#elif defined(linux) || defined(__linux__) || defined(__linux)
#   define U_PLATFORM U_PF_LINUX
#elif defined(__APPLE__) && defined(__MACH__)
#   include <TargetConditionals.h>
    /*
     * Targets with TARGET_OS_IPHONE set are U_PF_IPHONE (a variant of TARGET_OS_MAC)
     * unless TARGET_OS_MACCATALYST is also set, in which case they are treated as Darwin.
     * TARGET_OS_MACCATALYST is only consulted when it is defined, so that a
     * TargetConditionals.h which does not define it still yields U_PF_IPHONE
     * for TARGET_OS_IPHONE builds instead of silently falling back to Darwin.
     */
#   if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) && \
       !(defined(TARGET_OS_MACCATALYST) && TARGET_OS_MACCATALYST)
#       define U_PLATFORM U_PF_IPHONE
#   else
#       define U_PLATFORM U_PF_DARWIN
#   endif
#elif defined(BSD) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__MirBSD__)
#   if defined(__FreeBSD__)
#       include <sys/endian.h>
#   endif
#   define U_PLATFORM U_PF_BSD
#elif defined(sun) || defined(__sun)
    /* Check defined(__SVR4) || defined(__svr4__) to distinguish Solaris from SunOS? */
#   define U_PLATFORM U_PF_SOLARIS
#   if defined(__GNUC__)
        /* Solaris/GCC needs this header file to get the proper endianness. Normally, this
         * header file is included with stddef.h but on Solaris/GCC, the GCC version of stddef.h
         * is included which does not include this header file.
         */
#       include <sys/isa_defs.h>
#   endif
#elif defined(_AIX) || defined(__TOS_AIX__)
#   define U_PLATFORM U_PF_AIX
#elif defined(_hpux) || defined(hpux) || defined(__hpux)
#   define U_PLATFORM U_PF_HPUX
#elif defined(sgi) || defined(__sgi)
#   define U_PLATFORM U_PF_IRIX
#elif defined(__QNX__) || defined(__QNXNTO__)
#   define U_PLATFORM U_PF_QNX
#elif defined(__TOS_MVS__)
#   define U_PLATFORM U_PF_OS390
#elif defined(__OS400__) || defined(__TOS_OS400__)
#   define U_PLATFORM U_PF_OS400
#elif defined(__HAIKU__)
#   define U_PLATFORM U_PF_HAIKU
#elif defined(__EMSCRIPTEN__)
#   define U_PLATFORM U_PF_EMSCRIPTEN
#else
    /* No known platform macro matched. */
#   define U_PLATFORM U_PF_UNKNOWN
#endif
/**
 * \def U_REAL_MSVC
 * Defined only when the compiler is the genuine MSVC compiler, as opposed to
 * a compiler such as Clang that sets _MSC_VER so that it can build Windows
 * code which requires it. Undefined in every other case.
 * @internal
 */
#if defined(U_IN_DOXYGEN) || (defined(_MSC_VER) && !(defined(__clang__) && __clang__))
#   define U_REAL_MSVC
#endif
/**
 * \def CYGWINMSVC
 * Defined when building on Windows with Cygwin, but with MSVC instead of gcc.
 * Otherwise undefined.
 * @internal
 */
/*
 * The automatic detection is deliberately left disabled, because
 * mh-cygwin-msvc already sets this macro:
 *
 *   #if U_PLATFORM == U_PF_CYGWIN && defined(_MSC_VER)
 *   #   define CYGWINMSVC
 *   #endif
 */
#if defined(U_IN_DOXYGEN)
    /* Only so that the macro shows up in the generated documentation. */
#   define CYGWINMSVC
#endif
/**
 * \def U_PLATFORM_USES_ONLY_WIN32_API
 * Tells whether the Win32 API is the only system API the platform uses.
 * 1 for Windows/MSVC, ClangCL and MinGW; Cygwin is not included.
 * @internal
 */
#if !defined(U_PLATFORM_USES_ONLY_WIN32_API)
#   if (U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_MINGW) || defined(CYGWINMSVC)
#       define U_PLATFORM_USES_ONLY_WIN32_API 1
#   else
        /* Everything else, including Cygwin, which implements POSIX. */
#       define U_PLATFORM_USES_ONLY_WIN32_API 0
#   endif
#endif
/**
 * \def U_PLATFORM_HAS_WIN32_API
 * Tells whether the Win32 API can be used on the platform.
 * 1 for Windows/MSVC, ClangCL, MinGW and Cygwin.
 * @internal
 */
#if !defined(U_PLATFORM_HAS_WIN32_API)
#   if U_PF_WINDOWS <= U_PLATFORM && U_PLATFORM <= U_PF_CYGWIN
#       define U_PLATFORM_HAS_WIN32_API 1
#   else
#       define U_PLATFORM_HAS_WIN32_API 0
#   endif
#endif
/**
 * \def U_PLATFORM_HAS_WINUWP_API
 * Tells whether the target is intended for the Universal Windows Platform API.
 * Set to 1 for Windows10 Release Solution Configuration.
 * Nothing in this header detects it: the value is 0 unless it is predefined.
 * @internal
 */
#ifndef U_PLATFORM_HAS_WINUWP_API
#   define U_PLATFORM_HAS_WINUWP_API 0
#endif
/**
 * \def U_PLATFORM_IMPLEMENTS_POSIX
 * Tells whether the platform implements (most of) the POSIX API.
 * 1 for Cygwin and most other platforms; 0 where only the Win32 API is used.
 * @internal
 */
#ifndef U_PLATFORM_IMPLEMENTS_POSIX
#   if U_PLATFORM_USES_ONLY_WIN32_API
#       define U_PLATFORM_IMPLEMENTS_POSIX 0
#   else
#       define U_PLATFORM_IMPLEMENTS_POSIX 1
#   endif
#endif
/**
 * \def U_PLATFORM_IS_LINUX_BASED
 * Tells whether the platform is Linux or one of its derivatives,
 * i.e. whether U_PLATFORM lies in the range U_PF_LINUX..4499.
 * @internal
 */
#if !defined(U_PLATFORM_IS_LINUX_BASED)
#   if U_PF_LINUX <= U_PLATFORM && U_PLATFORM <= 4499
#       define U_PLATFORM_IS_LINUX_BASED 1
#   else
#       define U_PLATFORM_IS_LINUX_BASED 0
#   endif
#endif
/**
 * \def U_PLATFORM_IS_DARWIN_BASED
 * Tells whether the platform is Darwin or one of its derivatives,
 * i.e. whether U_PLATFORM lies in the range U_PF_DARWIN..U_PF_IPHONE.
 * @internal
 */
#if !defined(U_PLATFORM_IS_DARWIN_BASED)
#   if U_PF_DARWIN <= U_PLATFORM && U_PLATFORM <= U_PF_IPHONE
#       define U_PLATFORM_IS_DARWIN_BASED 1
#   else
#       define U_PLATFORM_IS_DARWIN_BASED 0
#   endif
#endif
/*===========================================================================*/
/** @{ Compiler and environment features */
/*===========================================================================*/
/**
 * \def U_GCC_MAJOR_MINOR
 * Non-zero exactly when the compiler is gcc; in that case the value encodes
 * the version as major * 100 + minor.
 * For any other compiler, U_GCC_MAJOR_MINOR == 0.
 *
 * For example, to test for gcc 4.6 or higher,
 * use "#if U_GCC_MAJOR_MINOR >= 406".
 * @internal
 */
#ifndef __GNUC__
#   define U_GCC_MAJOR_MINOR 0
#else
#   define U_GCC_MAJOR_MINOR (__GNUC__ * 100 + __GNUC_MINOR__)
#endif
/**
 * \def U_IS_BIG_ENDIAN
 * Byte order of the platform: non-zero for big-endian, 0 for little-endian.
 * A predefined value is kept; otherwise the first applicable source below is used.
 * @internal
 */
#ifndef U_IS_BIG_ENDIAN
#   if defined(BYTE_ORDER) && defined(BIG_ENDIAN)
#       define U_IS_BIG_ENDIAN (BYTE_ORDER == BIG_ENDIAN)
#   elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
        /* gcc */
#       define U_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#   elif defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN)
#       define U_IS_BIG_ENDIAN 1
#   elif defined(__LITTLE_ENDIAN__) || defined(_LITTLE_ENDIAN)
#       define U_IS_BIG_ENDIAN 0
#   elif (U_PLATFORM == U_PF_OS390 || U_PLATFORM == U_PF_OS400 || defined(__s390__) || defined(__s390x__)) || \
         (defined(_PA_RISC1_0) || defined(_PA_RISC1_1) || defined(_PA_RISC2_0)) || \
         (defined(sparc) || defined(__sparc) || defined(__sparc__))
        /*
         * Big-endian targets that do not appear to predefine any endianness macros:
         * z/OS, IBM i and s390; HPPA; and some sparc based systems (e.g. Linux).
         */
#       define U_IS_BIG_ENDIAN 1
#   else
        /* Nothing matched: assume little-endian. */
#       define U_IS_BIG_ENDIAN 0
#   endif
#endif
/**
 * \def U_HAVE_PLACEMENT_NEW
 * Controls whether placement new and delete are overridden for STL.
 * Defaults to 1, except for Borland compilers where it is 0.
 * @stable ICU 2.6
 */
#ifndef U_HAVE_PLACEMENT_NEW
#   ifdef __BORLANDC__
#       define U_HAVE_PLACEMENT_NEW 0
#   else
#       define U_HAVE_PLACEMENT_NEW 1
#   endif
#endif
/**
 * \def U_HAVE_DEBUG_LOCATION_NEW
 * Define this to define the MFC debug version of the operator new.
 * Defaults to 1 when _MSC_VER is defined and to 0 otherwise.
 *
 * @stable ICU 3.4
 */
#ifndef U_HAVE_DEBUG_LOCATION_NEW
#   ifdef _MSC_VER
#       define U_HAVE_DEBUG_LOCATION_NEW 1
#   else
#       define U_HAVE_DEBUG_LOCATION_NEW 0
#   endif
#endif
/*
 * Wrappers around the clang-style feature-test operators
 * (http://clang.llvm.org/docs/LanguageExtensions.html).
 * Each wrapper forwards to the corresponding __has_xxx operator when the
 * compiler provides it and evaluates to 0 otherwise, so that it can be used
 * in #if expressions with compilers other than clang.
 */
#ifndef __has_attribute
#   define UPRV_HAS_ATTRIBUTE(x) 0
#else
#   define UPRV_HAS_ATTRIBUTE(x) __has_attribute(x)
#endif

#ifndef __has_cpp_attribute
#   define UPRV_HAS_CPP_ATTRIBUTE(x) 0
#else
#   define UPRV_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
#endif

#ifndef __has_declspec_attribute
#   define UPRV_HAS_DECLSPEC_ATTRIBUTE(x) 0
#else
#   define UPRV_HAS_DECLSPEC_ATTRIBUTE(x) __has_declspec_attribute(x)
#endif

#ifndef __has_builtin
#   define UPRV_HAS_BUILTIN(x) 0
#else
#   define UPRV_HAS_BUILTIN(x) __has_builtin(x)
#endif

#ifndef __has_feature
#   define UPRV_HAS_FEATURE(x) 0
#else
#   define UPRV_HAS_FEATURE(x) __has_feature(x)
#endif

#ifndef __has_extension
#   define UPRV_HAS_EXTENSION(x) 0
#else
#   define UPRV_HAS_EXTENSION(x) __has_extension(x)
#endif

#ifndef __has_warning
#   define UPRV_HAS_WARNING(x) 0
#else
#   define UPRV_HAS_WARNING(x) __has_warning(x)
#endif
/*
 * UPRV_NO_SANITIZE_UNDEFINED expands to clang's no_sanitize("undefined")
 * attribute when compiling with clang, and to nothing for other compilers.
 */
#ifndef __clang__
#   define UPRV_NO_SANITIZE_UNDEFINED
#else
#   define UPRV_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined")))
#endif
/**
 * \def U_MALLOC_ATTR
 * Attribute that marks a function as malloc-like.
 * Expands to nothing unless the compiler defines __GNUC__ with a value of 3 or higher.
 * @internal
 */
#if !(defined(__GNUC__) && __GNUC__>=3)
#   define U_MALLOC_ATTR
#else
#   define U_MALLOC_ATTR __attribute__ ((__malloc__))
#endif
/**
 * \def U_ALLOC_SIZE_ATTR
 * Attribute that names the parameter(s) holding the size of the buffer
 * allocated by a malloc-like function. U_ALLOC_SIZE_ATTR2 is the
 * two-parameter form. Both expand to nothing when the compiler is neither
 * gcc 4.3 or later nor reports support for the alloc_size attribute.
 * @internal
 */
#if !((defined(__GNUC__) &&                                         \
        (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) || \
      UPRV_HAS_ATTRIBUTE(alloc_size))
#   define U_ALLOC_SIZE_ATTR(X)
#   define U_ALLOC_SIZE_ATTR2(X,Y)
#else
#   define U_ALLOC_SIZE_ATTR(X) __attribute__ ((alloc_size(X)))
#   define U_ALLOC_SIZE_ATTR2(X,Y) __attribute__ ((alloc_size(X,Y)))
#endif
/**
 * \def U_CPLUSPLUS_VERSION
 * 0 if no C++; 1, 11, 14, ... if C++.
 * Support for specific features cannot always be determined by the C++ version alone.
 * @internal
 */
#if !defined(__cplusplus)
    /* Not C++: the value is 0, overriding any non-zero predefined value. */
#   if !defined(U_CPLUSPLUS_VERSION)
#       define U_CPLUSPLUS_VERSION 0
#   elif U_CPLUSPLUS_VERSION != 0
#       undef U_CPLUSPLUS_VERSION
#       define U_CPLUSPLUS_VERSION 0
#   endif
#elif !defined(U_CPLUSPLUS_VERSION)
    /*
     * C++ without a predefined value: derive it from __cplusplus, or from
     * _MSVC_LANG where that is defined.
     */
#   if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
#       define U_CPLUSPLUS_VERSION 17
#   elif __cplusplus >= 201402L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L)
#       define U_CPLUSPLUS_VERSION 14
#   elif __cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)
#       define U_CPLUSPLUS_VERSION 11
#   else
        /* C++98 or C++03 */
#       define U_CPLUSPLUS_VERSION 1
#   endif
#endif
/* Remaining case: C++ with a predefined value, which is used as is. */
/**
 * \def U_FALLTHROUGH
 * Annotate intentional fall-through between switch labels.
 * http://clang.llvm.org/docs/AttributeReference.html#fallthrough-clang-fallthrough
 * @internal
 */
#if defined(__cplusplus) && !defined(U_FALLTHROUGH)
    /* C++ only, and only when no value has been predefined. */
#   if defined(__clang__)
        /*
         * The compiler is tested separately from the feature,
         * because other compilers might choke on the feature test.
         */
#       if UPRV_HAS_CPP_ATTRIBUTE(clang::fallthrough) || \
           (UPRV_HAS_FEATURE(cxx_attributes) && \
            UPRV_HAS_WARNING("-Wimplicit-fallthrough"))
#           define U_FALLTHROUGH [[clang::fallthrough]]
#       endif
#   elif defined(__GNUC__) && (__GNUC__ >= 7)
#       define U_FALLTHROUGH __attribute__((fallthrough))
#   endif
#endif
/* In every remaining case (including C) the macro expands to nothing. */
#if !defined(U_FALLTHROUGH)
#   define U_FALLTHROUGH
#endif
/** @} */
/*===========================================================================*/
/** @{ Character data types */
/*===========================================================================*/
/**
 * Value of U_CHARSET_FAMILY on ASCII based platforms.
 * @stable ICU 2.0
 */
#define U_ASCII_FAMILY  0

/**
 * Value of U_CHARSET_FAMILY on EBCDIC based platforms.
 * @stable ICU 2.0
 */
#define U_EBCDIC_FAMILY 1
/**
* \def U_CHARSET_FAMILY
*
* <p>These definitions specify the encoding of text
* in the char data type as defined by the platform and the compiler.
* It is enough to determine the code point values of "invariant characters",
* which are the ones shared by all encodings that are in use
* on a given platform.</p>
*
* <p>Those "invariant characters" should be all the uppercase and lowercase
* latin letters, the digits, the space, and "basic punctuation".
* Also, '\\n', '\\r', '\\t' should be available.</p>
*
* <p>The list of "invariant characters" is:<br>
* \code
* A-Z a-z 0-9 SPACE " % &amp; ' ( ) * + , - . / : ; < = > ? _
* \endcode
* <br>
* (52 letters + 10 numbers + 20 punc/sym/space = 82 total)</p>
*
* <p>This matches the IBM Syntactic Character Set (CS 640).</p>
*
* <p>In other words, all the graphic characters in 7-bit ASCII should
* be safely accessible except the following:</p>
*
* \code
* '\' <backslash>
* '[' <left bracket>
* ']' <right bracket>
* '{' <left brace>
* '}' <right brace>
* '^' <circumflex>
* '~' <tilde>
* '!' <exclamation mark>
* '#' <number sign>
* '|' <vertical line>
* '$' <dollar sign>
* '@' <commercial at>
* '`' <grave accent>
* \endcode
* @stable ICU 2.0
*/
#ifndef U_CHARSET_FAMILY
    /*
     * EBCDIC is selected for z/OS unless __CHARSET_LIB is defined to a non-zero
     * value, and for IBM i unless __UTF32__ is defined. Everything else is ASCII.
     */
#   if (U_PLATFORM == U_PF_OS390 && (!defined(__CHARSET_LIB) || !__CHARSET_LIB)) || \
       (U_PLATFORM == U_PF_OS400 && !defined(__UTF32__))
#       define U_CHARSET_FAMILY U_EBCDIC_FAMILY
#   else
#       define U_CHARSET_FAMILY U_ASCII_FAMILY
#   endif
#endif
/**
* \def U_CHARSET_IS_UTF8
*
* Hardcode the default charset to UTF-8.
*
* If this is set to 1, then
* - ICU will assume that all non-invariant char*, StringPiece, std::string etc.
* contain UTF-8 text, regardless of what the system API uses
* - some ICU code will use fast functions like u_strFromUTF8()
* rather than the more general and more heavy-weight conversion API (ucnv.h)
* - ucnv_getDefaultName() always returns "UTF-8"
* - ucnv_setDefaultName() is disabled and will not change the default charset
* - static builds of ICU are smaller
* - more functionality is available with the UCONFIG_NO_CONVERSION build-time
* configuration option (see unicode/uconfig.h)
* - the UCONFIG_NO_CONVERSION build option in uconfig.h is more usable
*
* @stable ICU 4.2
* @see UCONFIG_NO_CONVERSION
*/
#ifndef U_CHARSET_IS_UTF8
    /* UTF-8 is hardcoded for Linux-based, Darwin-based and Emscripten targets only. */
#   if U_PLATFORM_IS_LINUX_BASED || U_PLATFORM_IS_DARWIN_BASED || \
       U_PLATFORM == U_PF_EMSCRIPTEN
#       define U_CHARSET_IS_UTF8 1
#   else
#       define U_CHARSET_IS_UTF8 0
#   endif
#endif
/** @} */
/*===========================================================================*/
/** @{ Information about wchar support */
/*===========================================================================*/
/**
 * \def U_HAVE_WCHAR_H
 * Indicates whether <wchar.h> is available (1) or not (0). Set to 1 by default.
 *
 * @stable ICU 2.0
 */
#ifndef U_HAVE_WCHAR_H
#   if U_PLATFORM == U_PF_ANDROID && __ANDROID_API__ < 9
        /*
         * wchar_t was not supported on Android before Gingerbread (Android 2.3, API level 9).
         * The type and the header were present, but the library functions did not work as expected:
         * the size of wchar_t was 1, yet L"xyz" string literals had 32-bit units anyway.
         */
#       define U_HAVE_WCHAR_H 0
#   else
#       define U_HAVE_WCHAR_H 1
#   endif
#endif
/**
 * \def U_SIZEOF_WCHAR_T
 * U_SIZEOF_WCHAR_T==sizeof(wchar_t)
 *
 * @stable ICU 2.0
 */
#ifndef U_SIZEOF_WCHAR_T
#   if (U_PLATFORM == U_PF_ANDROID && __ANDROID_API__ < 9)
        /* Android before API level 9: wchar_t is treated as a single byte. */
#       define U_SIZEOF_WCHAR_T 1
#   elif U_PLATFORM_HAS_WIN32_API || U_PLATFORM == U_PF_CYGWIN
#       define U_SIZEOF_WCHAR_T 2
#   elif U_PLATFORM == U_PF_AIX
        /*
         * AIX 6.1 information, section "Wide character data representation":
         * "... the wchar_t datatype is 32-bit in the 64-bit environment and
         * 16-bit in the 32-bit environment."
         * and
         * "All locales use Unicode for their wide character code values (process code),
         * except the IBM-eucTW codeset."
         */
#       ifdef __64BIT__
#           define U_SIZEOF_WCHAR_T 4
#       else
#           define U_SIZEOF_WCHAR_T 2
#       endif
#   elif U_PLATFORM == U_PF_OS390
        /*
         * z/OS V1R11 information center, section "LP64 | ILP32":
         * "In 31-bit mode, the size of long and pointers is 4 bytes and the size of wchar_t is 2 bytes.
         * Under LP64, the size of long and pointer is 8 bytes and the size of wchar_t is 4 bytes."
         */
#       ifdef _LP64
#           define U_SIZEOF_WCHAR_T 4
#       else
#           define U_SIZEOF_WCHAR_T 2
#       endif
#   elif U_PLATFORM == U_PF_OS400
        /*
         * The size follows the LOCALETYPE setting:
         * - LOCALETYPE(*LOCALEUTF), i.e. __UTF32__ defined:
         *   wide-character strings are in UTF-32, narrow-character strings in UTF-8.
         * - LOCALETYPE(*LOCALEUCS2), i.e. __UCS2__ defined:
         *   wide-character strings are in UCS-2, narrow-character strings in EBCDIC.
         * - LOCALETYPE(*CLD) or LOCALETYPE(*LOCALE):
         *   wide-character strings are in 16-bit EBCDIC, narrow-character strings in EBCDIC.
         * Only the first case uses 4 bytes; the other two both use 2.
         */
#       if defined(__UTF32__)
#           define U_SIZEOF_WCHAR_T 4
#       else
#           define U_SIZEOF_WCHAR_T 2
#       endif
#   else
        /* Default for all other platforms. */
#       define U_SIZEOF_WCHAR_T 4
#   endif
#endif
/* Unless predefined, U_HAVE_WCSCPY takes the value of U_HAVE_WCHAR_H. */
#if !defined(U_HAVE_WCSCPY)
#   define U_HAVE_WCSCPY U_HAVE_WCHAR_H
#endif
/** @} */
/**
 * \def U_HAVE_CHAR16_T
 * Tells whether the char16_t type is available for UTF-16
 * and whether u"abc" UTF-16 string literals are supported.
 * This is a new standard type and standard string literal syntax in C++11
 * but has been available in some compilers before.
 * @internal
 */
#ifndef U_HAVE_CHAR16_T
    /*
     * C++11 and C11 both require support for UTF-16 literals.
     * In C this does not work on Mac (see workaround in ptypes.h)
     * or on Cygwin less than 3.5.
     */
#   if defined(__cplusplus)
#       define U_HAVE_CHAR16_T 1
#   elif U_PLATFORM_IS_DARWIN_BASED || (U_PLATFORM == U_PF_CYGWIN && CYGWIN_VERSION_DLL_MAJOR < 3005)
#       define U_HAVE_CHAR16_T 0
#   else
        /* conformant C11 */
#       define U_HAVE_CHAR16_T 1
#   endif
#endif
/**
 * @{
 * \def U_DECLARE_UTF16
 * Do not use this macro because it is not defined on all platforms.
 * In C++, use std::u16string_view literals, see the UNICODE_STRING docs.
 * In C, use u"UTF-16 literals".
 * See also the public U_STRING_DECL macro.
 * @internal
 */
#ifndef U_DECLARE_UTF16
    /*
     * Prefer u"..." literals where they are available; otherwise fall back to
     * L"..." literals where wchar_t has 2 bytes and the charset family is ASCII
     * (value 0), or on z/OS..IBM i with __UCS2__ defined.
     * NOTE(review): 035000 has a leading zero and is therefore read as an octal
     * constant -- confirm that this is the intended __HP_aCC threshold.
     */
#   if U_HAVE_CHAR16_T \
       || (defined(__xlC__) && defined(__IBM_UTF_LITERAL) && U_SIZEOF_WCHAR_T != 2) \
       || (defined(__HP_aCC) && __HP_aCC >= 035000) \
       || (defined(__HP_cc) && __HP_cc >= 111106) \
       || (defined(U_IN_DOXYGEN))
#       define U_DECLARE_UTF16(string) u ## string
#   elif U_SIZEOF_WCHAR_T == 2 \
         && (U_CHARSET_FAMILY == 0 || (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400 && defined(__UCS2__)))
#       define U_DECLARE_UTF16(string) L ## string
#   else
        /* Leave U_DECLARE_UTF16 undefined. See unistr.h. */
#   endif
#endif
/** @} */
/*===========================================================================*/
/** @{ Symbol import-export control */
/*===========================================================================*/
/*
 * U_EXPORT: decoration for symbols that a library exports.
 * A predefined value is kept; static builds (U_STATIC_IMPLEMENTATION) use no decoration.
 */
#ifndef U_EXPORT
#   if defined(U_STATIC_IMPLEMENTATION)
#       define U_EXPORT
#   elif defined(_MSC_VER) || (UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllexport__) && \
                               UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__))
#       define U_EXPORT __declspec(dllexport)
#   elif defined(__GNUC__) || defined(__open_xl__)
#       define U_EXPORT __attribute__((visibility("default")))
#   elif (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x550) \
         || (defined(__SUNPRO_C) && __SUNPRO_C >= 0x550)
#       define U_EXPORT __global
/*# elif defined(__HP_aCC) || defined(__HP_cc)
#       define U_EXPORT __declspec(dllexport)*/
#   else
#       define U_EXPORT
#   endif
#endif
/*
 * U_EXPORT2 (U_CALLCONV is related to it): __cdecl when _MSC_VER is defined,
 * empty otherwise. A predefined value is kept.
 */
#ifndef U_EXPORT2
#   ifdef _MSC_VER
#       define U_EXPORT2 __cdecl
#   else
#       define U_EXPORT2
#   endif
#endif
/*
 * U_IMPORT: decoration for symbols imported from a library.
 * A predefined value is kept.
 */
#ifndef U_IMPORT
#   if defined(_MSC_VER) || (UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllexport__) && \
                             UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__))
        /* Windows needs to export/import data. */
#       define U_IMPORT __declspec(dllimport)
#   else
#       define U_IMPORT
#   endif
#endif
/**
 * \def U_HIDDEN
 * Marks internal structs that are declared within external classes,
 * so that the internal structs do not get the same visibility as the
 * class within which they are declared.
 * @internal
 */
#ifndef U_HIDDEN
#   if defined(__GNUC__) || defined(__open_xl__)
#       define U_HIDDEN __attribute__((visibility("hidden")))
#   else
#       define U_HIDDEN
#   endif
#endif
/**
* \def U_CALLCONV
* Similar to U_CDECL_BEGIN/U_CDECL_END, this qualifier is necessary
* in callback function typedefs to make sure that the calling convention
* is compatible.
*
* This is only used for non-ICU-API functions.
* When a function is a public ICU API,
* you must use the U_CAPI and U_EXPORT2 qualifiers.
*
* Please note, you need to use U_CALLCONV after the *.
*
* NO : "static const char U_CALLCONV *func( . . . )"
* YES: "static const char* U_CALLCONV func( . . . )"
*
* @stable ICU 2.0
*/
/* __cdecl for C++ on z/OS; everywhere else the same as U_EXPORT2. */
#if !(U_PLATFORM == U_PF_OS390 && defined(__cplusplus))
#   define U_CALLCONV U_EXPORT2
#else
#   define U_CALLCONV __cdecl
#endif
/**
 * \def U_CALLCONV_FPTR
 * Similar to U_CALLCONV, but only used on function pointers.
 * Same as U_CALLCONV for C++ on z/OS; empty everywhere else.
 * @internal
 */
#if !(U_PLATFORM == U_PF_OS390 && defined(__cplusplus))
#   define U_CALLCONV_FPTR
#else
#   define U_CALLCONV_FPTR U_CALLCONV
#endif
/** @} */
#endif // _PLATFORM_H

View File

@@ -0,0 +1,66 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* FILE NAME : ptypes.h
*
* Date Name Description
* 05/13/98 nos Creation (content moved here from ptypes.h).
* 03/02/99 stephen Added AS400 support.
* 03/30/99 stephen Added Linux support.
* 04/13/99 stephen Reworked for autoconf.
* 09/18/08 srl Moved basic types back to ptypes.h from platform.h
******************************************************************************
*/
/**
* \file
* \brief C API: Definitions of integer types of various widths
*/
#ifndef _PTYPES_H
#define _PTYPES_H
/**
 * \def __STDC_LIMIT_MACROS
 * According to the Linux stdint.h, the ISO C99 standard specifies that C++ implementations
 * should define macros like INT32_MIN and UINTPTR_MAX only if explicitly requested.
 * __STDC_LIMIT_MACROS therefore has to be defined before stdint.h is included in C++ code
 * that uses such limit macros.
 * @internal
 */
#if !defined(__STDC_LIMIT_MACROS)
#   define __STDC_LIMIT_MACROS
#endif
/* NULL, size_t, wchar_t */
#include <stddef.h>
/* More platform-specific definitions. */
#include "unicode/platform.h"
/*===========================================================================*/
/* Generic data types */
/*===========================================================================*/
#include <stdint.h>
// Both C++11 and C11 specify that the data type char16_t should exist:
// as a keyword in C++11, and as a typedef in the uchar.h header file in C11.
// Not all implementations actually do this (looking at you, Apple, spring 2024),
// so ICU4C must detect and deal with that.
// Nothing is needed for C++ or for the documentation build.
#if !defined(__cplusplus)
#   if !defined(U_IN_DOXYGEN)
#       if U_HAVE_CHAR16_T
#           include <uchar.h>
#       else
            typedef uint16_t char16_t;
#       endif
#   endif
#endif
#endif /* _PTYPES_H */

183
thirdparty/icu4c/common/unicode/putil.h vendored Normal file
View File

@@ -0,0 +1,183 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1997-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* FILE NAME : putil.h
*
* Date Name Description
* 05/14/98 nos Creation (content moved here from utypes.h).
* 06/17/99 erm Added IEEE_754
* 07/22/98 stephen Added IEEEremainder, max, min, trunc
* 08/13/98 stephen Added isNegativeInfinity, isPositiveInfinity
* 08/24/98 stephen Added longBitsFromDouble
* 03/02/99 stephen Removed openFile(). Added AS400 support.
* 04/15/99 stephen Converted to C
* 11/15/99 helena Integrated S/390 changes for IEEE support.
* 01/11/00 helena Added u_getVersion.
******************************************************************************
*/
#ifndef PUTIL_H
#define PUTIL_H
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Platform Utilities
*/
/*==========================================================================*/
/* Platform utilities */
/*==========================================================================*/
/**
* Platform utilities isolates the platform dependencies of the
* library. For each platform which this code is ported to, these
* functions may have to be re-implemented.
*/
/**
 * Return the ICU data directory.
 * The data directory is where common format ICU data files (.dat files)
 * are loaded from. Note that normal use of the built-in ICU
 * facilities does not require loading of an external data file;
 * unless you are adding custom data to ICU, the data directory
 * does not need to be set.
 *
 * The data directory is determined as follows; the first rule that applies wins:
 * - If u_setDataDirectory() has been called, the directory set there is used.
 * - Otherwise, if the ICU_DATA environment variable is set, its value is used.
 * - Otherwise, if a data directory was specified at ICU build time
 * \code
 * #define ICU_DATA_DIR "path"
 * \endcode
 *   that directory is used.
 * - Otherwise no data directory is available.
 *
 * @return the data directory, or an empty string ("") if no data directory has
 *         been specified.
 *
 * @stable ICU 2.0
 */
U_CAPI const char* U_EXPORT2 u_getDataDirectory(void);
/**
 * Set the ICU data directory.
 * The data directory is where common format ICU data files (.dat files)
 * are loaded from. Note that normal use of the built-in ICU
 * facilities does not require loading of an external data file;
 * unless you are adding custom data to ICU, the data directory
 * does not need to be set.
 *
 * This function should be called at most once in a process, before the
 * first ICU operation (e.g., u_init()) that will require the loading of an
 * ICU data file.
 *
 * This function is not thread-safe. Call it before ICU APIs are used from
 * multiple threads.
 *
 * @param directory The directory to be set.
 *
 * @see u_init
 * @see u_getDataDirectory
 * @stable ICU 2.0
 */
U_CAPI void U_EXPORT2 u_setDataDirectory(const char *directory);
#ifndef U_HIDE_INTERNAL_API
/**
 * Return the time zone files override directory, or an empty string if
 * no directory was specified. Certain time zone resources will be preferentially
 * loaded from individual files in this directory.
 *
 * @param status Error code parameter.
 *               NOTE(review): presumably the usual ICU in/out UErrorCode
 *               convention applies -- confirm against the implementation.
 * @return the time zone data override directory.
 * @internal
 */
U_CAPI const char * U_EXPORT2 u_getTimeZoneFilesDirectory(UErrorCode *status);
/**
 * Set the time zone files override directory.
 * This function is not thread safe; it must not be called concurrently with
 * u_getTimeZoneFilesDirectory() or any other use of ICU time zone functions.
 * This function should only be called before using any ICU service that
 * will access the time zone data.
 *
 * @param path   The directory to use as the time zone files override directory.
 * @param status Error code parameter.
 *               NOTE(review): presumably the usual ICU in/out UErrorCode
 *               convention applies -- confirm against the implementation.
 * @internal
 */
U_CAPI void U_EXPORT2 u_setTimeZoneFilesDirectory(const char *path, UErrorCode *status);
#endif /* U_HIDE_INTERNAL_API */
/**
 * @{
 * Filesystem file and path separator characters.
 * Example: '/' and ':' on Unix, '\\' and ';' on Windows.
 * @stable ICU 2.0
 */
#if U_PLATFORM_USES_ONLY_WIN32_API
    /* Win32 only: backslash between file name components, '/' as the alternative. */
#   define U_FILE_SEP_CHAR        '\\'
#   define U_FILE_SEP_STRING      "\\"
#   define U_FILE_ALT_SEP_CHAR    '/'
#   define U_FILE_ALT_SEP_STRING  "/"
    /* Entries of a path list are separated by ';'. */
#   define U_PATH_SEP_CHAR        ';'
#   define U_PATH_SEP_STRING      ";"
#else
    /* All other platforms: '/' is both the separator and its alternative. */
#   define U_FILE_SEP_CHAR        '/'
#   define U_FILE_SEP_STRING      "/"
#   define U_FILE_ALT_SEP_CHAR    '/'
#   define U_FILE_ALT_SEP_STRING  "/"
    /* Entries of a path list are separated by ':'. */
#   define U_PATH_SEP_CHAR        ':'
#   define U_PATH_SEP_STRING      ":"
#endif
/** @} */
/**
 * Convert char characters to UChar characters.
 * This utility function is useful only for "invariant characters"
 * that are encoded in the platform default encoding.
 * They are a small, constant subset of the encoding and include
 * just the Latin letters, digits, and some punctuation.
 * For details, see U_CHARSET_FAMILY.
 *
 * @param cs Input string, points to <code>length</code>
 *           character bytes from a subset of the platform encoding.
 * @param us Output string, points to memory for <code>length</code>
 *           Unicode characters.
 * @param length The number of characters to convert; this may
 *               include the terminating <code>NUL</code>.
 *
 * @see U_CHARSET_FAMILY
 * @see u_UCharsToChars
 * @stable ICU 2.0
 */
U_CAPI void U_EXPORT2
u_charsToUChars(const char *cs, UChar *us, int32_t length);
/**
 * Convert UChar characters to char characters.
 * This utility function is useful only for "invariant characters"
 * that can be encoded in the platform default encoding.
 * They are a small, constant subset of the encoding and include
 * just the Latin letters, digits, and some punctuation.
 * For details, see U_CHARSET_FAMILY.
 *
 * @param us Input string, points to <code>length</code>
 *           Unicode characters that can be encoded with the
 *           codepage-invariant subset of the platform encoding.
 * @param cs Output string, points to memory for <code>length</code>
 *           character bytes.
 * @param length The number of characters to convert; this may
 *               include the terminating <code>NUL</code>.
 *
 * @see U_CHARSET_FAMILY
 * @see u_charsToUChars
 * @stable ICU 2.0
 */
U_CAPI void U_EXPORT2
u_UCharsToChars(const UChar *us, char *cs, int32_t length);
#endif

823
thirdparty/icu4c/common/unicode/rbbi.h vendored Normal file
View File

@@ -0,0 +1,823 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
***************************************************************************
* Copyright (C) 1999-2016 International Business Machines Corporation *
* and others. All rights reserved. *
***************************************************************************
**********************************************************************
* Date Name Description
* 10/22/99 alan Creation.
* 11/11/99 rgillam Complete port from Java.
**********************************************************************
*/
#ifndef RBBI_H
#define RBBI_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C++ API: Rule Based Break Iterator
*/
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/brkiter.h"
#include "unicode/udata.h"
#include "unicode/parseerr.h"
#include "unicode/schriter.h"
struct UCPTrie;
U_NAMESPACE_BEGIN
/** @internal */
class LanguageBreakEngine;
struct RBBIDataHeader;
class RBBIDataWrapper;
class UnhandledEngine;
class UStack;
#ifndef U_HIDE_INTERNAL_API
/**
* The ExternalBreakEngine class defines an abstract interface that lets the host environment
* provide a low-level facility for breaking Unicode text in scripts whose text boundaries
* cannot be handled by the upper-level rule-based logic, for example, for Chinese and Japanese
* word breaking, Thai, Khmer, Burmese, Lao and other Southeast Asian scripts.
* The host environment implements one or more subclasses of ExternalBreakEngine and
* registers them at initialization time by calling
* RuleBasedBreakIterator::registerExternalBreakEngine(). ICU adopts and owns the engine and will
* delete the registered external engine at the proper time during the clean up
* event.
* @internal ICU 74 technology preview
*/
class ExternalBreakEngine : public UObject {
public:
/**
* Destructor.
* @internal ICU 74 technology preview
*/
virtual ~ExternalBreakEngine() {}
/**
* <p>Indicate whether this engine handles a particular character when
* the RuleBasedBreakIterator is used for a particular locale. This method is used
* by the RuleBasedBreakIterator to find a break engine.</p>
* @param c A character which begins a run that the engine might handle.
* @param locale The locale.
* @return true if this engine handles the particular character for that locale.
* @internal ICU 74 technology preview
*/
virtual bool isFor(UChar32 c, const char* locale) const = 0;
/**
* <p>Indicate whether this engine handles a particular character. This method is
* used by the RuleBasedBreakIterator after it has already found a break engine, to see which
* characters after the first one can be handled by this break engine.</p>
* @param c A character that the engine might handle.
* @return true if this engine handles the particular character.
* @internal ICU 74 technology preview
*/
virtual bool handles(UChar32 c) const = 0;
/**
* <p>Divide up a range of text handled by this break engine.</p>
*
* @param text A UText representing the text
* @param start The start of the range of known characters
* @param end The end of the range of known characters
* @param foundBreaks Output C array of int32_t break positions, or
* nullptr
* @param foundBreaksCapacity The capacity of foundBreaks
* @param status Information on any errors encountered.
* @return The number of breaks found
* @internal ICU 74 technology preview
*/
virtual int32_t fillBreaks(UText* text, int32_t start, int32_t end,
int32_t* foundBreaks, int32_t foundBreaksCapacity,
UErrorCode& status) const = 0;
};
#endif /* U_HIDE_INTERNAL_API */
/**
*
* A subclass of BreakIterator whose behavior is specified using a list of rules.
* <p>Instances of this class are most commonly created by the factory methods of
* BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
* and then used via the abstract API in class BreakIterator</p>
*
* <p>See the ICU User Guide for information on Break Iterator Rules.</p>
*
* <p>This class is not intended to be subclassed.</p>
*/
class U_COMMON_API RuleBasedBreakIterator /*final*/ : public BreakIterator {
private:
/**
* The UText through which this BreakIterator accesses the text
* @internal (private)
*/
UText fText = UTEXT_INITIALIZER;
#ifndef U_HIDE_INTERNAL_API
public:
#endif /* U_HIDE_INTERNAL_API */
/**
* The rule data for this BreakIterator instance.
* Not for general use; Public only for testing purposes.
* @internal
*/
RBBIDataWrapper *fData = nullptr;
private:
/**
* The saved error code associated with this break iterator.
* This is the value to be returned by copyErrorTo().
*/
UErrorCode fErrorCode = U_ZERO_ERROR;
/**
* The current position of the iterator. Pinned, 0 <= fPosition <= text.length.
* Never has the value UBRK_DONE (-1).
*/
int32_t fPosition = 0;
/**
* Index of the rule status information for the boundary at the current position.
* Set by handleNext() from the state table; indicates which rules caused the boundary.
* TODO: document what this index refers into (confirm against the implementation).
*/
int32_t fRuleStatusIndex = 0;
/**
* Cache of previously determined boundary positions.
*/
class BreakCache;
BreakCache *fBreakCache = nullptr;
/**
* Cache of boundary positions within a region of text that has been
* sub-divided by dictionary based breaking.
*/
class DictionaryCache;
DictionaryCache *fDictionaryCache = nullptr;
/**
*
* If present, UStack of LanguageBreakEngine objects that might handle
* dictionary characters. Searched from top to bottom to find an object to
* handle a given character.
* @internal (private)
*/
UStack *fLanguageBreakEngines = nullptr;
/**
*
* If present, the special LanguageBreakEngine used for handling
* characters that are in the dictionary set, but not handled by any
* LanguageBreakEngine.
* @internal (private)
*/
UnhandledEngine *fUnhandledBreakEngine = nullptr;
/**
* Counter for the number of characters encountered with the "dictionary"
* flag set.
* @internal (private)
*/
uint32_t fDictionaryCharCount = 0;
/**
* A character iterator that refers to the same text as the UText, above.
* Only included for compatibility with old API, which was based on CharacterIterators.
* Value may be adopted from outside, or point at fSCharIter, below (the default).
*/
CharacterIterator *fCharIter = &fSCharIter;
/**
* When the input text is provided by a UnicodeString, this is the
* CharacterIterator that wraps that data. Needed only for the
* implementation of getText(), a backwards compatibility issue.
*/
UCharCharacterIterator fSCharIter {u"", 0};
/**
* True when iteration has run off the end, and iterator functions should return UBRK_DONE.
*/
bool fDone = false;
/**
* Array of look-ahead tentative results.
*/
int32_t *fLookAheadMatches = nullptr;
/**
* A flag to indicate if phrase based breaking is enabled.
*/
UBool fIsPhraseBreaking = false;
//=======================================================================
// constructors
//=======================================================================
/**
* Constructor from a flattened set of RBBI data in malloced memory.
* RuleBasedBreakIterators built from a custom set of rules
* are created via this constructor; the rules are compiled
* into memory, then the break iterator is constructed here.
*
* The break iterator adopts the memory, and will
* free it when done.
* @internal (private)
*/
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
/**
* This constructor uses the udata interface to create a BreakIterator
* whose internal tables live in a memory-mapped file. "image" is an
* ICU UDataMemory handle for the pre-compiled break iterator tables.
* @param image handle to the memory image for the break iterator data.
* Ownership of the UDataMemory handle passes to the Break Iterator,
* which will be responsible for closing it when it is no longer needed.
* @param status Information on any errors encountered.
* @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
* @see udata_open
* @see #getBinaryRules
* @internal (private)
*/
RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);
/** @internal */
friend class RBBIRuleBuilder;
/** @internal */
friend class BreakIterator;
/**
* Default constructor with an error code parameter.
* Aside from error handling, identical to the default constructor.
* Internally, handles common initialization for other constructors.
* @internal (private)
*/
RuleBasedBreakIterator(UErrorCode *status);
public:
/** Default constructor. Creates an empty shell of an iterator, with no
* rules or text to iterate over. Object can subsequently be assigned to,
* but is otherwise unusable.
* @stable ICU 2.2
*/
RuleBasedBreakIterator();
/**
* Copy constructor. Will produce a break iterator with the same behavior,
* and which iterates over the same text, as the one passed in.
* @param that The RuleBasedBreakIterator passed to be copied
* @stable ICU 2.0
*/
RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
/**
* Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
* @param rules The break rules to be used.
* @param parseError In the event of a syntax error in the rules, provides the location
* within the rules of the problem.
* @param status Information on any errors encountered.
* @stable ICU 2.2
*/
RuleBasedBreakIterator( const UnicodeString &rules,
UParseError &parseError,
UErrorCode &status);
/**
* Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
* Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules().
* Construction of a break iterator in this way is substantially faster than
* construction from source rules.
*
* Ownership of the storage containing the compiled rules remains with the
* caller of this function. The compiled rules must not be modified or
* deleted during the life of the break iterator.
*
* The compiled rules are not compatible across different major versions of ICU.
* The compiled rules are compatible only between machines with the same
* byte ordering (little or big endian) and the same base character set family
* (ASCII or EBCDIC).
*
* @see #getBinaryRules
* @param compiledRules A pointer to the compiled break rules to be used.
* @param ruleLength The length of the compiled break rules, in bytes. This
* corresponds to the length value produced by getBinaryRules().
* @param status Information on any errors encountered, including invalid
* binary rules.
* @stable ICU 4.8
*/
RuleBasedBreakIterator(const uint8_t *compiledRules,
uint32_t ruleLength,
UErrorCode &status);
/**
* This constructor uses the udata interface to create a BreakIterator
* whose internal tables live in a memory-mapped file. "image" is an
* ICU UDataMemory handle for the pre-compiled break iterator tables.
* @param image handle to the memory image for the break iterator data.
* Ownership of the UDataMemory handle passes to the Break Iterator,
* which will be responsible for closing it when it is no longer needed.
* @param status Information on any errors encountered.
* @see udata_open
* @see #getBinaryRules
* @stable ICU 2.8
*/
RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
/**
* Destructor
* @stable ICU 2.0
*/
virtual ~RuleBasedBreakIterator();
/**
* Assignment operator. Sets this iterator to have the same behavior,
* and iterate over the same text, as the one passed in.
* @param that The RuleBasedBreakIterator passed in
* @return a RuleBasedBreakIterator reference; by assignment-operator convention
* this is expected to be *this (no new iterator is created)
* @stable ICU 2.0
*/
RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
/**
* Equality operator. Returns true if both BreakIterators are of the
* same class, have the same behavior, and iterate over the same text.
* @param that The BreakIterator to be compared for equality
* @return true if both BreakIterators are of the
* same class, have the same behavior, and iterate over the same text.
* @stable ICU 2.0
*/
virtual bool operator==(const BreakIterator& that) const override;
/**
* Not-equal operator. If operator== returns true, this returns false,
* and vice versa.
* @param that The BreakIterator to be compared for inequality
* @return true if the two BreakIterators are not the same.
* @stable ICU 2.0
*/
inline bool operator!=(const BreakIterator& that) const {
return !operator==(that);
}
/**
* Returns a newly-constructed RuleBasedBreakIterator with the same
* behavior, and iterating over the same text, as this one.
* Differs from the copy constructor in that it is polymorphic, and
* will correctly clone (copy) a derived class.
* clone() is thread safe. Multiple threads may simultaneously
* clone the same source break iterator.
* @return a newly-constructed RuleBasedBreakIterator
* @stable ICU 2.0
*/
virtual RuleBasedBreakIterator* clone() const override;
/**
* Compute a hash code for this BreakIterator
* @return A hash code
* @stable ICU 2.0
*/
virtual int32_t hashCode() const;
/**
* Returns the description used to create this iterator
* @return the description used to create this iterator
* @stable ICU 2.0
*/
virtual const UnicodeString& getRules() const;
//=======================================================================
// BreakIterator overrides
//=======================================================================
/**
* <p>
* Return a CharacterIterator over the text being analyzed.
* The returned character iterator is owned by the break iterator, and must
* not be deleted by the caller. Repeated calls to this function may
* return the same CharacterIterator.
* </p>
* <p>
* The returned character iterator must not be used concurrently with
* the break iterator. If concurrent operation is needed, clone the
* returned character iterator first and operate on the clone.
* </p>
* <p>
* When the break iterator is operating on text supplied via a UText,
* this function will fail, returning a CharacterIterator containing no text.
* The function getUText() provides similar functionality,
* is reliable, and is more efficient.
* </p>
*
* TODO: deprecate this function?
*
* @return An iterator over the text being analyzed.
* @stable ICU 2.0
*/
virtual CharacterIterator& getText() const override;
/**
* Get a UText for the text being analyzed.
* The returned UText is a shallow clone of the UText used internally
* by the break iterator implementation. It can safely be used to
* access the text without impacting any break iterator operations,
* but the underlying text itself must not be altered.
*
* @param fillIn A UText to be filled in. If nullptr, a new UText will be
* allocated to hold the result.
* @param status receives any error codes.
* @return The current UText for this break iterator. If an input
* UText was provided, it will always be returned.
* @stable ICU 3.4
*/
virtual UText *getUText(UText *fillIn, UErrorCode &status) const override;
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText An iterator over the text to analyze. The BreakIterator
* takes ownership of the character iterator. The caller MUST NOT delete it!
* @stable ICU 2.0
*/
virtual void adoptText(CharacterIterator* newText) override;
/**
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
*
* The BreakIterator will retain a reference to the supplied string.
* The caller must not modify or delete the text while the BreakIterator
* retains the reference.
*
* @param newText The text to analyze.
* @stable ICU 2.0
*/
virtual void setText(const UnicodeString& newText) override;
/**
* Reset the break iterator to operate over the text represented by
* the UText. The iterator position is reset to the start.
*
* This function makes a shallow clone of the supplied UText. This means
* that the caller is free to immediately close or otherwise reuse the
* UText that was passed as a parameter, but that the underlying text itself
* must not be altered while being referenced by the break iterator.
*
* @param text The UText used to change the text.
* @param status Receives any error codes.
* @stable ICU 3.4
*/
virtual void setText(UText *text, UErrorCode &status) override;
/**
* Sets the current iteration position to the beginning of the text, position zero.
* @return The offset of the beginning of the text, zero.
* @stable ICU 2.0
*/
virtual int32_t first() override;
/**
* Sets the current iteration position to the end of the text.
* @return The text's past-the-end offset.
* @stable ICU 2.0
*/
virtual int32_t last() override;
/**
* Advances the iterator either forward or backward the specified number of steps.
* Negative values move backward, and positive values move forward. This is
* equivalent to repeatedly calling next() or previous().
* @param n The number of steps to move. The sign indicates the direction
* (negative is backwards, and positive is forwards).
* @return The character offset of the boundary position n boundaries away from
* the current one.
* @stable ICU 2.0
*/
virtual int32_t next(int32_t n) override;
/**
* Advances the iterator to the next boundary position.
* @return The position of the first boundary after this one.
* @stable ICU 2.0
*/
virtual int32_t next() override;
/**
* Moves the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
* @stable ICU 2.0
*/
virtual int32_t previous() override;
/**
* Sets the iterator to refer to the first boundary position following
* the specified position.
* @param offset The position from which to begin searching for a break position.
* @return The position of the first break after the current position.
* @stable ICU 2.0
*/
virtual int32_t following(int32_t offset) override;
/**
* Sets the iterator to refer to the last boundary position before the
* specified position.
* @param offset The position to begin searching for a break from.
* @return The position of the last boundary before the starting position.
* @stable ICU 2.0
*/
virtual int32_t preceding(int32_t offset) override;
/**
* Returns true if the specified position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at
* or after "offset".
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable ICU 2.0
*/
virtual UBool isBoundary(int32_t offset) override;
/**
* Returns the current iteration position. Note that UBRK_DONE is never
* returned from this function; if iteration has run to the end of a
* string, current() will return the length of the string while
* next() will return UBRK_DONE.
* @return The current iteration position.
* @stable ICU 2.0
*/
virtual int32_t current() const override;
/**
* Return the status tag from the break rule that determined the boundary at
* the current iteration position. For break rules that do not specify a
* status, a default value of 0 is returned. If more than one break rule
* would cause a boundary to be located at some position in the text,
* the numerically largest of the applicable status values is returned.
* <p>
* Of the standard types of ICU break iterators, only word break and
* line break provide status values. The values are defined in
* the header file ubrk.h. For Word breaks, the status allows distinguishing between words
* that contain alphabetic letters, "words" that appear to be numbers,
* punctuation and spaces, words containing ideographic characters, and
* more. For Line Break, the status distinguishes between hard (mandatory) breaks
* and soft (potential) break positions.
* <p>
* <code>getRuleStatus()</code> can be called after obtaining a boundary
* position from <code>next()</code>, <code>previous()</code>, or
* any other break iterator functions that returns a boundary position.
* <p>
* Note that <code>getRuleStatus()</code> returns the value corresponding to
* <code>current()</code> index even after <code>next()</code> has returned DONE.
* <p>
* When creating custom break rules, one is free to define whatever
* status values may be convenient for the application.
* <p>
* @return the status from the break rule that determined the boundary
* at the current iteration position.
*
* @see UWordBreak
* @stable ICU 2.2
*/
virtual int32_t getRuleStatus() const override;
/**
* Get the status (tag) values from the break rule(s) that determined the boundary
* at the current iteration position.
* <p>
* The returned status value(s) are stored into an array provided by the caller.
* The values are stored in sorted (ascending) order.
* If the capacity of the output array is insufficient to hold the data,
* the output will be truncated to the available length, and a
* U_BUFFER_OVERFLOW_ERROR will be signaled.
*
* @param fillInVec an array to be filled in with the status values.
* @param capacity the length of the supplied vector. A length of zero causes
* the function to return the number of status values, in the
* normal way, without attempting to store any values.
* @param status receives error codes.
* @return The number of rule status values from the rules that determined
* the boundary at the current iteration position.
* In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
* is the total number of status values that were available,
* not the reduced number that were actually returned.
* @see getRuleStatus
* @stable ICU 3.0
*/
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) override;
/**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
* This method is to implement a simple version of RTTI, since not all
* C++ compilers support genuine RTTI. Polymorphic operator==() and
* clone() methods call this method.
*
* @return The class ID for this object. All objects of a
* given class have the same class ID. Objects of
* other classes have different class IDs.
* @stable ICU 2.0
*/
virtual UClassID getDynamicClassID() const override;
/**
* Returns the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID(). For example:
*
* Base* polymorphic_pointer = createPolymorphicObject();
* if (polymorphic_pointer->getDynamicClassID() ==
* Derived::getStaticClassID()) ...
*
* @return The class ID for all objects of this class.
* @stable ICU 2.0
*/
static UClassID U_EXPORT2 getStaticClassID();
#ifndef U_FORCE_HIDE_DEPRECATED_API
/**
* Deprecated functionality. Use clone() instead.
*
* Create a clone (copy) of this break iterator in memory provided
* by the caller. The idea is to increase performance by avoiding
* a storage allocation. Use of this function is NOT RECOMMENDED.
* Performance gains are minimal, and correct buffer management is
* tricky. Use clone() instead.
*
* @param stackBuffer The pointer to the memory into which the cloned object
* should be placed. If nullptr, allocate heap memory
* for the cloned object.
* @param BufferSize The size of the buffer. If zero, return the required
* buffer size, but do not clone the object. If the
* size was too small (but not zero), allocate heap
* storage for the cloned object.
*
* @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
* returned if the provided buffer was too small, and
* the clone was therefore put on the heap.
*
* @return Pointer to the clone object. This may differ from the stackBuffer
* address if the byte alignment of the stack buffer was not suitable
* or if the stackBuffer was too small to hold the clone.
* @deprecated ICU 52. Use clone() instead.
*/
virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status) override;
#endif // U_FORCE_HIDE_DEPRECATED_API
/**
* Return the binary form of compiled break rules,
* which can then be used to create a new break iterator at some
* time in the future. Creating a break iterator from pre-compiled rules
* is much faster than building one from the source form of the
* break rules.
*
* The binary data can only be used with the same version of ICU
* and on the same platform type (processor endian-ness)
*
* @param length Returns the length of the binary data. (Out parameter.)
*
* @return A pointer to the binary (compiled) rule data. The storage
* belongs to the RuleBasedBreakIterator object, not the
* caller, and must not be modified or deleted.
* @stable ICU 4.8
*/
virtual const uint8_t *getBinaryRules(uint32_t &length);
/**
* Set the subject text string upon which the break iterator is operating
* without changing any other aspect of the matching state.
* The new and previous text strings must have the same content.
*
* This function is intended for use in environments where ICU is operating on
* strings that may move around in memory. It provides a mechanism for notifying
* ICU that the string has been relocated, and providing a new UText to access the
* string in its new position.
*
* Note that the break iterator implementation never copies the underlying text
* of a string being processed, but always operates directly on the original text
* provided by the user. Refreshing simply drops the references to the old text
* and replaces them with references to the new.
*
* Caution: this function is normally used only by very specialized,
* system-level code. One example use case is with garbage collection that moves
* the text in memory.
*
* @param input The new (moved) text string.
* @param status Receives errors detected by this function.
* @return *this
*
* @stable ICU 49
*/
virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status) override;
private:
//=======================================================================
// implementation
//=======================================================================
/**
* Iterate backwards from an arbitrary position in the input text using the
* synthesized Safe Reverse rules.
* This locates a "Safe Position" from which the forward break rules
* will operate correctly. A Safe Position is not necessarily a boundary itself.
*
* @param fromPosition the position in the input text to begin the iteration.
* @internal (private)
*/
int32_t handleSafePrevious(int32_t fromPosition);
/**
* Find a rule-based boundary by running the state machine.
* Input
* fPosition, the position in the text to begin from.
* Output
* fPosition: the boundary following the starting position.
* fDictionaryCharCount the number of dictionary characters encountered.
* If > 0, the segment will be further subdivided
* fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
*
* @internal (private)
*/
int32_t handleNext();
/*
* Templatized version of handleNext() and handleSafePrevious().
*
* There will be exactly four instantiations, two each for 8 and 16 bit tables,
* two each for 8 and 16 bit trie.
* Having separate instantiations for the table types keeps conditional tests of
* the table type out of the inner loops, at the expense of replicated code.
*
* The template parameter for the Trie access function is a value, not a type.
* Doing it this way, the compiler will inline the Trie function in the
* expanded functions. (Both the 8 and 16 bit access functions have the same type
* signature)
*/
typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
template<typename RowType, PTrieFunc trieFunc>
int32_t handleSafePrevious(int32_t fromPosition);
template<typename RowType, PTrieFunc trieFunc>
int32_t handleNext();
/**
* This function returns the appropriate LanguageBreakEngine for a
* given character c.
* @param c A character in the dictionary set
* @param locale The locale.
* @internal (private)
*/
const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c, const char* locale);
public:
#ifndef U_HIDE_INTERNAL_API
/**
* Debugging function only.
* @internal
*/
void dumpCache();
/**
* Debugging function only.
* @internal
*/
void dumpTables();
#endif /* U_HIDE_INTERNAL_API */
#ifndef U_HIDE_INTERNAL_API
/**
* Register a new external break engine. The external break engine will be adopted.
* Because ICU may choose to cache break engine internally, this must
* be called at application startup, prior to any calls to
* object methods of RuleBasedBreakIterator to avoid undefined behavior.
* @param toAdopt the ExternalBreakEngine instance to be adopted
* @param status the in/out status code, no special meanings are assigned
* @internal ICU 74 technology preview
*/
static void U_EXPORT2 registerExternalBreakEngine(
ExternalBreakEngine* toAdopt, UErrorCode& status);
#endif /* U_HIDE_INTERNAL_API */
};
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

266
thirdparty/icu4c/common/unicode/rep.h vendored Normal file
View File

@@ -0,0 +1,266 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**************************************************************************
* Copyright (C) 1999-2012, International Business Machines Corporation and
* others. All Rights Reserved.
**************************************************************************
* Date Name Description
* 11/17/99 aliu Creation. Ported from java. Modified to
* match current UnicodeString API. Forced
* to use name "handleReplaceBetween" because
* of existing methods in UnicodeString.
**************************************************************************
*/
#ifndef REP_H
#define REP_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: Replaceable String
*/
U_NAMESPACE_BEGIN
class UnicodeString;
/**
* <code>Replaceable</code> is an abstract base class representing a
* string of characters that supports the replacement of a range of
* itself with a new string of characters. It is used by APIs that
* change a piece of text while retaining metadata. Metadata is data
* other than the Unicode characters returned by char32At(). One
* example of metadata is style attributes; another is an edit
* history, marking each character with an author and revision number.
*
* <p>An implicit aspect of the <code>Replaceable</code> API is that
* during a replace operation, new characters take on the metadata of
* the old characters. For example, if the string "the <b>bold</b>
* font" has range (4, 8) replaced with "strong", then it becomes "the
* <b>strong</b> font".
*
* <p><code>Replaceable</code> specifies ranges using a start
* offset and a limit offset. The range of characters thus specified
* includes the characters at offset start..limit-1. That is, the
* start offset is inclusive, and the limit offset is exclusive.
*
* <p><code>Replaceable</code> also includes API to access characters
* in the string: <code>length()</code>, <code>charAt()</code>,
* <code>char32At()</code>, and <code>extractBetween()</code>.
*
* <p>For a subclass to support metadata, typical behavior of
* <code>replace()</code> is the following:
* <ul>
* <li>Set the metadata of the new text to the metadata of the first
* character replaced</li>
* <li>If no characters are replaced, use the metadata of the
* previous character</li>
* <li>If there is no previous character (i.e. start == 0), use the
* following character</li>
* <li>If there is no following character (i.e. the replaceable was
* empty), use default metadata.</li>
* <li>If the code point U+FFFF is seen, it should be interpreted as
* a special marker having no metadata</li>
* </ul>
* If this is not the behavior, the subclass should document any differences.
* @author Alan Liu
* @stable ICU 2.0
*/
class U_COMMON_API Replaceable : public UObject {
public:
/**
* Destructor.
* @stable ICU 2.0
*/
virtual ~Replaceable();
/**
* Returns the number of 16-bit code units in the text.
* @return number of 16-bit code units in text
* @stable ICU 1.8
*/
inline int32_t length() const;
/**
* Returns the 16-bit code unit at the given offset into the text.
* @param offset an integer between 0 and <code>length()</code>-1
* inclusive
* @return 16-bit code unit of text at given offset
* @stable ICU 1.8
*/
inline char16_t charAt(int32_t offset) const;
/**
* Returns the 32-bit code point at the given 16-bit offset into
* the text. This assumes the text is stored as 16-bit code units
* with surrogate pairs intermixed. If the offset of a leading or
* trailing code unit of a surrogate pair is given, return the
* code point of the surrogate pair.
*
* @param offset an integer between 0 and <code>length()</code>-1
* inclusive
* @return 32-bit code point of text at given offset
* @stable ICU 1.8
*/
inline UChar32 char32At(int32_t offset) const;
/**
* Copies characters in the range [<tt>start</tt>, <tt>limit</tt>)
* into the UnicodeString <tt>target</tt>.
* @param start offset of first character which will be copied
* @param limit offset immediately following the last character to
* be copied
* @param target UnicodeString into which to copy characters.
* @return A reference to <TT>target</TT>
* @stable ICU 2.1
*/
virtual void extractBetween(int32_t start,
int32_t limit,
UnicodeString& target) const = 0;
/**
* Replaces a substring of this object with the given text. If the
* characters being replaced have metadata, the new characters
* that replace them should be given the same metadata.
*
* <p>Subclasses must ensure that if the text between start and
* limit is equal to the replacement text, that replace has no
* effect. That is, any metadata
* should be unaffected. In addition, subclasses are encouraged to
* check for initial and trailing identical characters, and make a
* smaller replacement if possible. This will preserve as much
* metadata as possible.
* @param start the beginning index, inclusive; <code>0 <= start
* <= limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit
* <= length()</code>.
* @param text the text to replace characters <code>start</code>
* to <code>limit - 1</code>
* @stable ICU 2.0
*/
virtual void handleReplaceBetween(int32_t start,
int32_t limit,
const UnicodeString& text) = 0;
// Note: All other methods in this class take the names of
// existing UnicodeString methods. This method is the exception.
// It is named differently because all replace methods of
// UnicodeString return a UnicodeString&. The 'between' is
// required in order to conform to the UnicodeString naming
// convention; API taking start/length are named <operation>, and
// those taking start/limit are named <operationBetween>. The
// 'handle' is added because 'replaceBetween' and
// 'doReplaceBetween' are already taken.
/**
* Copies a substring of this object, retaining metadata.
* This method is used to duplicate or reorder substrings.
* The destination index must not overlap the source range.
*
* @param start the beginning index, inclusive; <code>0 <= start <=
* limit</code>.
* @param limit the ending index, exclusive; <code>start <= limit <=
* length()</code>.
* @param dest the destination index. The characters from
* <code>start..limit-1</code> will be copied to <code>dest</code>.
* Implementations of this method may assume that <code>dest <= start ||
* dest >= limit</code>.
* @stable ICU 2.0
*/
virtual void copy(int32_t start, int32_t limit, int32_t dest) = 0;
/**
* Returns true if this object contains metadata. If a
* Replaceable object has metadata, calls to the Replaceable API
* must be made so as to preserve metadata. If it does not, calls
* to the Replaceable API may be optimized to improve performance.
* The default implementation returns true.
* @return true if this object contains metadata
* @stable ICU 2.2
*/
virtual UBool hasMetaData() const;
/**
* Clone this object, an instance of a subclass of Replaceable.
* Clones can be used concurrently in multiple threads.
* If a subclass does not implement clone(), or if an error occurs,
* then nullptr is returned.
* The caller must delete the clone.
*
* @return a clone of this object
*
* @see getDynamicClassID
* @stable ICU 2.6
*/
virtual Replaceable *clone() const;
protected:
/**
* Default constructor.
* @stable ICU 2.4
*/
inline Replaceable();
/*
* Assignment operator not declared. The compiler will provide one
* which does nothing since this class does not contain any data members.
* API/code coverage may show the assignment operator as present and
* untested - ignore.
* Subclasses need this assignment operator if they use compiler-provided
* assignment operators of their own. An alternative to not declaring one
* here would be to declare and empty-implement a protected or public one.
Replaceable &Replaceable::operator=(const Replaceable &);
*/
/**
* Virtual version of length().
* @stable ICU 2.4
*/
virtual int32_t getLength() const = 0;
/**
* Virtual version of charAt().
* @stable ICU 2.4
*/
virtual char16_t getCharAt(int32_t offset) const = 0;
/**
* Virtual version of char32At().
* @stable ICU 2.4
*/
virtual UChar32 getChar32At(int32_t offset) const = 0;
};
// Default constructor. The class declares no data members, so the body is
// intentionally empty.
inline Replaceable::Replaceable()
{
}
// Non-virtual public accessor for the text length; the value is supplied by
// the subclass through the virtual getLength() hook.
inline int32_t Replaceable::length() const
{
    const int32_t codeUnitCount = getLength();
    return codeUnitCount;
}
// Non-virtual public accessor for a single 16-bit code unit; the value is
// supplied by the subclass through the virtual getCharAt() hook.
inline char16_t Replaceable::charAt(int32_t offset) const
{
    const char16_t codeUnit = getCharAt(offset);
    return codeUnit;
}
// Non-virtual public accessor for a code point; the value is supplied by the
// subclass through the virtual getChar32At() hook.
inline UChar32 Replaceable::char32At(int32_t offset) const
{
    const UChar32 codePoint = getChar32At(offset);
    return codePoint;
}
// There is no rep.cpp, see unistr.cpp for Replaceable function implementations.
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,489 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1996-2013, International Business Machines Corporation
* and others. All Rights Reserved.
*
******************************************************************************
*
* File resbund.h
*
* CREATED BY
* Richard Gillam
*
* Modification History:
*
* Date Name Description
* 2/5/97 aliu Added scanForLocaleInFile. Added
* constructor which attempts to read resource bundle
* from a specific file, without searching other files.
* 2/11/97 aliu Added UErrorCode return values to constructors. Fixed
* infinite loops in scanForFile and scanForLocale.
* Modified getRawResourceData to not delete storage
* in localeData and resourceData which it doesn't own.
* Added Mac compatibility #ifdefs for tellp() and
* ios::nocreate.
* 2/18/97 helena Updated with 100% documentation coverage.
* 3/13/97 aliu Rewrote to load in entire resource bundle and store
* it as a Hashtable of ResourceBundleData objects.
* Added state table to govern parsing of files.
* Modified to load locale index out of new file
* distinct from default.txt.
* 3/25/97 aliu Modified to support 2-d arrays, needed for timezone
* data. Added support for custom file suffixes. Again,
* needed to support timezone data.
* 4/7/97 aliu Cleaned up.
* 03/02/99 stephen Removed dependency on FILE*.
* 03/29/99 helena Merged Bertrand and Stephen's changes.
* 06/11/99 stephen Removed parsing of .txt files.
* Reworked to use new binary format.
* Cleaned up.
* 06/14/99 stephen Removed methods taking a filename suffix.
* 11/09/99 weiv Added getLocale(), fRealLocale, removed fRealLocaleID
******************************************************************************
*/
#ifndef RESBUND_H
#define RESBUND_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
#include "unicode/ures.h"
#include "unicode/unistr.h"
#include "unicode/locid.h"
/**
* \file
* \brief C++ API: Resource Bundle
*/
U_NAMESPACE_BEGIN
/**
* A class representing a collection of resource information pertaining to a given
* locale. A resource bundle provides a way of accessing locale-specific information in
* a data file. You create a resource bundle that manages the resources for a given
* locale and then ask it for individual resources.
* <P>
* Resource bundles in ICU4C are currently defined using text files which conform to the following
* <a href="https://github.com/unicode-org/icu-docs/blob/main/design/bnf_rb.txt">BNF definition</a>.
* More on resource bundle concepts and syntax can be found in the
* <a href="https://unicode-org.github.io/icu/userguide/locale/resources">Users Guide</a>.
* <P>
*
* The ResourceBundle class is not suitable for subclassing.
*
* @stable ICU 2.0
*/
class U_COMMON_API ResourceBundle : public UObject {
public:
/**
* Constructor
*
* @param packageName The packageName and locale together point to an ICU udata object,
* as defined by <code> udata_open( packageName, "res", locale, err) </code>
* or equivalent. Typically, packageName will refer to a (.dat) file, or to
* a package registered with udata_setAppData(). Using a full file or directory
* pathname for packageName is deprecated.
* @param locale This is the locale this resource bundle is for. To get resources
* for the French locale, for example, you would create a
* ResourceBundle passing Locale::FRENCH for the "locale" parameter,
* and all subsequent calls to that resource bundle will return
* resources that pertain to the French locale. If the caller doesn't
* pass a locale parameter, the default locale for the system (as
* returned by Locale::getDefault()) will be used.
* @param err The Error Code.
* The UErrorCode& err parameter is used to return status information to the user. To
* check whether the construction succeeded or not, you should check the value of
* U_SUCCESS(err). If you wish more detailed information, you can check for
* informational error results which still indicate success. U_USING_FALLBACK_WARNING
* indicates that a fall back locale was used. For example, 'de_CH' was requested,
* but nothing was found there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that
* the default locale data was used; neither the requested locale nor any of its
* fall back locales could be found.
* @stable ICU 2.0
*/
ResourceBundle(const UnicodeString& packageName,
const Locale& locale,
UErrorCode& err);
/**
* Construct a resource bundle for the default bundle in the specified package.
*
* @param packageName The packageName and locale together point to an ICU udata object,
* as defined by <code> udata_open( packageName, "res", locale, err) </code>
* or equivalent. Typically, packageName will refer to a (.dat) file, or to
* a package registered with udata_setAppData(). Using a full file or directory
* pathname for packageName is deprecated.
* @param err A UErrorCode value
* @stable ICU 2.0
*/
ResourceBundle(const UnicodeString& packageName,
UErrorCode& err);
/**
* Construct a resource bundle for the ICU default bundle.
*
* @param err A UErrorCode value
* @stable ICU 2.0
*/
ResourceBundle(UErrorCode &err);
/**
* Standard constructor, constructs a resource bundle for the locale-specific
* bundle in the specified package.
*
* @param packageName The packageName and locale together point to an ICU udata object,
* as defined by <code> udata_open( packageName, "res", locale, err) </code>
* or equivalent. Typically, packageName will refer to a (.dat) file, or to
* a package registered with udata_setAppData(). Using a full file or directory
* pathname for packageName is deprecated.
* nullptr is used to refer to ICU data.
* @param locale The locale for which to open a resource bundle.
* @param err A UErrorCode value
* @stable ICU 2.0
*/
ResourceBundle(const char* packageName,
const Locale& locale,
UErrorCode& err);
/**
* Copy constructor.
*
* @param original The resource bundle to copy.
* @stable ICU 2.0
*/
ResourceBundle(const ResourceBundle &original);
/**
* Constructor from a C UResourceBundle. The resource bundle is
* copied and not adopted. ures_close will still need to be used on the
* original resource bundle.
*
* @param res A pointer to the C resource bundle.
* @param status A UErrorCode value.
* @stable ICU 2.0
*/
ResourceBundle(UResourceBundle *res,
UErrorCode &status);
/**
* Assignment operator.
*
* @param other The resource bundle to copy.
* @stable ICU 2.0
*/
ResourceBundle&
operator=(const ResourceBundle& other);
/** Destructor.
* @stable ICU 2.0
*/
virtual ~ResourceBundle();
/**
* Clone this object.
* Clones can be used concurrently in multiple threads.
* If an error occurs, then nullptr is returned.
* The caller must delete the clone.
*
* @return a clone of this object
*
* @see getDynamicClassID
* @stable ICU 2.8
*/
ResourceBundle *clone() const;
/**
* Returns the size of a resource. Size for scalar types is always 1, and for vector/table types is
* the number of child resources.
* @warning Integer array is treated as a scalar type. There are no
* APIs to access individual members of an integer array. It
* is always returned as a whole.
*
* @return number of resources in a given resource.
* @stable ICU 2.0
*/
int32_t getSize() const;
/**
* Returns a string from a string resource type.
*
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a warning
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return the string as a UnicodeString object (the declared return type is a
* UnicodeString by value, not a raw char16_t pointer).
* NOTE(review): earlier wording described a zero-terminated char16_t array living
* in a memory mapped/DLL file; whether the returned object refers to such storage
* cannot be determined from this declaration - confirm against the implementation.
* @stable ICU 2.0
*/
UnicodeString
getString(UErrorCode& status) const;
/**
* Returns binary data from a resource. Can be used on most primitive resource types (binaries,
* strings, ints).
*
* @param len fills in the length of resulting byte chunk
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a warning
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return a pointer to a chunk of unsigned bytes which live in a memory mapped/DLL file.
* @stable ICU 2.0
*/
const uint8_t*
getBinary(int32_t& len, UErrorCode& status) const;
/**
* Returns an integer vector from a resource.
*
* @param len fills in the length of resulting integer vector
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a warning
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return a pointer to a vector of integers that lives in a memory mapped/DLL file.
* @stable ICU 2.0
*/
const int32_t*
getIntVector(int32_t& len, UErrorCode& status) const;
/**
* Returns an unsigned integer from a resource.
* This integer is originally 28 bits.
*
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a warning
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return an unsigned integer value
* @stable ICU 2.0
*/
uint32_t
getUInt(UErrorCode& status) const;
/**
* Returns a signed integer from a resource.
* This integer is originally 28 bits and the sign gets propagated.
*
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a warning
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return a signed integer value
* @stable ICU 2.0
*/
int32_t
getInt(UErrorCode& status) const;
/**
* Checks whether the resource has another element to iterate over.
*
* @return true if there are more elements, false if there are no more elements
* @stable ICU 2.0
*/
UBool hasNext() const;
/**
* Resets the internal context of a resource so that iteration starts from the first element.
*
* @stable ICU 2.0
*/
void resetIterator();
/**
* Returns the key associated with this resource. Not all the resources have a key - only
* those that are members of a table.
*
* @return a key associated to this resource, or nullptr if it doesn't have a key
* @stable ICU 2.0
*/
const char* getKey() const;
/**
* Gets the locale ID of the resource bundle as a string.
* Same as getLocale().getName() .
*
* @return the locale ID of the resource bundle as a string
* @stable ICU 2.0
*/
const char* getName() const;
/**
* Returns the type of a resource. Available types are defined in enum UResType
*
* @return type of the given resource.
* @stable ICU 2.0
*/
UResType getType() const;
/**
* Returns the next resource in a given resource.
* The result is a ResourceBundle object returned by value, so this method
* cannot return nullptr; use hasNext() to test for remaining elements.
* NOTE(review): presumably running past the last element is reported through
* <code>status</code> - confirm against the implementation.
*
* @param status fills in the outgoing error code
* @return ResourceBundle object.
* @stable ICU 2.0
*/
ResourceBundle
getNext(UErrorCode& status);
/**
* Returns the next string in a resource.
* The result is a UnicodeString object returned by value, so this method
* cannot return nullptr; use hasNext() to test for remaining elements.
* NOTE(review): presumably running past the last element is reported through
* <code>status</code> - confirm against the implementation.
*
* @param status fills in the outgoing error code
* @return a UnicodeString object.
* @stable ICU 2.0
*/
UnicodeString
getNextString(UErrorCode& status);
/**
* Returns the next string in a resource.
* The result is a UnicodeString object returned by value, so this method
* cannot return nullptr; use hasNext() to test for remaining elements.
* NOTE(review): presumably running past the last element is reported through
* <code>status</code> - confirm against the implementation.
*
* @param key fill in for key associated with this string
* @param status fills in the outgoing error code
* @return a UnicodeString object.
* @stable ICU 2.0
*/
UnicodeString
getNextString(const char ** key,
UErrorCode& status);
/**
* Returns the resource in a resource at the specified index.
*
* @param index an index to the wanted resource.
* @param status fills in the outgoing error code
* @return ResourceBundle object. If there is an error, resource is invalid.
* @stable ICU 2.0
*/
ResourceBundle
get(int32_t index,
UErrorCode& status) const;
/**
* Returns the string in a given resource at the specified index.
*
* @param index an index to the wanted string.
* @param status fills in the outgoing error code
* @return a UnicodeString object. If there is an error, string is bogus
* @stable ICU 2.0
*/
UnicodeString
getStringEx(int32_t index,
UErrorCode& status) const;
/**
* Returns a resource in a resource that has a given key. This procedure works only with table
* resources.
*
* @param key a key associated with the wanted resource
* @param status fills in the outgoing error code.
* @return ResourceBundle object. If there is an error, resource is invalid.
* @stable ICU 2.0
*/
ResourceBundle
get(const char* key,
UErrorCode& status) const;
/**
* Returns a string in a resource that has a given key. This procedure works only with table
* resources.
*
* @param key a key associated with the wanted string
* @param status fills in the outgoing error code
* @return a UnicodeString object. If there is an error, string is bogus
* @stable ICU 2.0
*/
UnicodeString
getStringEx(const char* key,
UErrorCode& status) const;
#ifndef U_HIDE_DEPRECATED_API
/**
* Return the version number associated with this ResourceBundle as a string. Please
* use getVersion instead; this method is deprecated.
*
* @return A version number string as specified in the resource bundle or its parent.
* The caller does not own this string.
* @see getVersion
* @deprecated ICU 2.8 Use getVersion instead.
*/
const char* getVersionNumber() const;
#endif /* U_HIDE_DEPRECATED_API */
/**
* Return the version number associated with this ResourceBundle as a UVersionInfo array.
*
* @param versionInfo A UVersionInfo array that is filled with the version number
* as specified in the resource bundle or its parent.
* @stable ICU 2.0
*/
void
getVersion(UVersionInfo versionInfo) const;
#ifndef U_HIDE_DEPRECATED_API
/**
* Return the Locale associated with this ResourceBundle.
*
* @return a Locale object
* @deprecated ICU 2.8 Use getLocale(ULocDataLocaleType type, UErrorCode &status) overload instead.
*/
const Locale& getLocale() const;
#endif /* U_HIDE_DEPRECATED_API */
/**
* Return the Locale associated with this ResourceBundle.
* @param type You can choose between requested, valid and actual
* locale. For description see the definition of
* ULocDataLocaleType in uloc.h
* @param status just for catching illegal arguments
*
* @return a Locale object
* @stable ICU 2.8
*/
Locale
getLocale(ULocDataLocaleType type, UErrorCode &status) const;
#ifndef U_HIDE_INTERNAL_API
/**
* This API implements multilevel fallback
* @internal
*/
ResourceBundle
getWithFallback(const char* key, UErrorCode& status);
#endif /* U_HIDE_INTERNAL_API */
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @stable ICU 2.2
*/
virtual UClassID getDynamicClassID() const override;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @stable ICU 2.2
*/
static UClassID U_EXPORT2 getStaticClassID();
private:
ResourceBundle() = delete; // default constructor not implemented
UResourceBundle *fResource;
void constructForLocale(const UnicodeString& path, const Locale& locale, UErrorCode& error);
Locale *fLocale;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,187 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1998-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*
* File schriter.h
*
* Modification History:
*
* Date Name Description
* 05/05/99 stephen Cleaned up.
******************************************************************************
*/
#ifndef SCHRITER_H
#define SCHRITER_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/chariter.h"
#include "unicode/uchriter.h"
/**
* \file
* \brief C++ API: String Character Iterator
*/
U_NAMESPACE_BEGIN
/**
* A concrete subclass of CharacterIterator that iterates over the
* characters (code units or code points) in a UnicodeString.
* It's possible not only to create an
* iterator that iterates over an entire UnicodeString, but also to
* create one that iterates over only a subrange of a UnicodeString
* (iterators over different subranges of the same UnicodeString don't
* compare equal).
* @see CharacterIterator
* @see ForwardCharacterIterator
* @stable ICU 2.0
*/
class U_COMMON_API StringCharacterIterator : public UCharCharacterIterator {
public:
/**
* Create an iterator over the UnicodeString referred to by "textStr".
* The UnicodeString object is copied.
* The iteration range is the whole string, and the starting position is 0.
* @param textStr The unicode string used to create an iterator
* @stable ICU 2.0
*/
StringCharacterIterator(const UnicodeString& textStr);
/**
* Create an iterator over the UnicodeString referred to by "textStr".
* The iteration range is the whole string, and the starting
* position is specified by "textPos". If "textPos" is outside the valid
* iteration range, the behavior of this object is undefined.
* @param textStr The unicode string used to create an iterator
* @param textPos The starting position of the iteration
* @stable ICU 2.0
*/
StringCharacterIterator(const UnicodeString& textStr,
int32_t textPos);
/**
* Create an iterator over the UnicodeString referred to by "textStr".
* The UnicodeString object is copied.
* The iteration range begins with the code unit specified by
* "textBegin" and ends with the code unit BEFORE the code unit specified
* by "textEnd". The starting position is specified by "textPos". If
* "textBegin" and "textEnd" don't form a valid range on "textStr" (i.e.,
* textBegin >= textEnd or either is negative or greater than textStr.length()),
* or "textPos" is outside the range defined by "textBegin" and "textEnd",
* the behavior of this iterator is undefined.
* @param textStr The unicode string used to create the StringCharacterIterator
* @param textBegin The begin position of the iteration range
* @param textEnd The end position of the iteration range
* @param textPos The starting position of the iteration
* @stable ICU 2.0
*/
StringCharacterIterator(const UnicodeString& textStr,
int32_t textBegin,
int32_t textEnd,
int32_t textPos);
/**
* Copy constructor. The new iterator iterates over the same range
* of the same string as "that", and its initial position is the
* same as "that"'s current position.
* The UnicodeString object in "that" is copied.
* @param that The StringCharacterIterator to be copied
* @stable ICU 2.0
*/
StringCharacterIterator(const StringCharacterIterator& that);
/**
* Destructor.
* @stable ICU 2.0
*/
virtual ~StringCharacterIterator();
/**
* Assignment operator. *this is altered to iterate over the same
* range of the same string as "that", and refers to the same
* character within that string as "that" does.
* @param that The object to be copied.
* @return a reference to a StringCharacterIterator (per the declared
* return type; no new object is created by value here).
* NOTE(review): presumably this is *this, as is conventional
* for assignment operators - confirm against the implementation.
* @stable ICU 2.0
*/
StringCharacterIterator&
operator=(const StringCharacterIterator& that);
/**
* Returns true if the iterators iterate over the same range of the
* same string and are pointing at the same character.
* @param that The ForwardCharacterIterator to be compared for equality
* @return true if the iterators iterate over the same range of the
* same string and are pointing at the same character.
* @stable ICU 2.0
*/
virtual bool operator==(const ForwardCharacterIterator& that) const override;
/**
* Returns a new StringCharacterIterator referring to the same
* character in the same range of the same string as this one. The
* caller must delete the new iterator.
* @return the newly cloned object.
* @stable ICU 2.0
*/
virtual StringCharacterIterator* clone() const override;
/**
* Sets the iterator to iterate over the provided string.
* @param newText The string to be iterated over
* @stable ICU 2.0
*/
void setText(const UnicodeString& newText);
/**
* Copies the UnicodeString under iteration into the UnicodeString
* referred to by "result". Even if this iterator iterates across
* only a part of this string, the whole string is copied.
* @param result Receives a copy of the text under iteration.
* @stable ICU 2.0
*/
virtual void getText(UnicodeString& result) override;
/**
* Return a class ID for this object (not really public)
* @return a class ID for this object.
* @stable ICU 2.0
*/
virtual UClassID getDynamicClassID() const override;
/**
* Return a class ID for this class (not really public)
* @return a class ID for this class
* @stable ICU 2.0
*/
static UClassID U_EXPORT2 getStaticClassID();
protected:
/**
* Default constructor, iteration over empty string.
* Protected, so it can only be invoked from within the class hierarchy.
* @stable ICU 2.0
*/
StringCharacterIterator();
/**
* Copy of the iterated string object.
* @stable ICU 2.0
*/
UnicodeString text;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,339 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 2014-2016, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* simpleformatter.h
*/
#ifndef __SIMPLEFORMATTER_H__
#define __SIMPLEFORMATTER_H__
/**
* \file
* \brief C++ API: Simple formatter, minimal subset of MessageFormat.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/unistr.h"
U_NAMESPACE_BEGIN
// Forward declaration:
namespace number::impl {
class SimpleModifier;
}
/**
* Formats simple patterns like "{1} was born in {0}".
* Minimal subset of MessageFormat; fast, simple, minimal dependencies.
* Supports only numbered arguments with no type nor style parameters,
* and formats only string values.
* Quoting via ASCII apostrophe compatible with ICU MessageFormat default behavior.
*
* Factory methods set error codes for syntax errors
* and for too few or too many arguments/placeholders.
*
* SimpleFormatter objects are thread-safe except for assignment and applying new patterns.
*
* Example:
* <pre>
* UErrorCode errorCode = U_ZERO_ERROR;
* SimpleFormatter fmt("{1} '{born}' in {0}", errorCode);
* UnicodeString result;
*
* // Output: "paul {born} in england"
* fmt.format("england", "paul", result, errorCode);
* </pre>
*
* This class is not intended for public subclassing.
*
* @see MessageFormat
* @see UMessagePatternApostropheMode
* @stable ICU 57
*/
class U_COMMON_API SimpleFormatter final : public UMemory {
public:
/**
* Default constructor.
* @stable ICU 57
*/
SimpleFormatter() : compiledPattern(static_cast<char16_t>(0)) {}
/**
* Constructs a formatter from the pattern string.
*
* @param pattern The pattern string.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* Set to U_ILLEGAL_ARGUMENT_ERROR for bad argument syntax.
* @stable ICU 57
*/
SimpleFormatter(const UnicodeString& pattern, UErrorCode &errorCode) {
applyPattern(pattern, errorCode);
}
/**
* Constructs a formatter from the pattern string.
* The number of arguments checked against the given limits is the
* highest argument number plus one, not the number of occurrences of arguments.
*
* @param pattern The pattern string.
* @param min The pattern must have at least this many arguments.
* @param max The pattern must have at most this many arguments.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* Set to U_ILLEGAL_ARGUMENT_ERROR for bad argument syntax and
* too few or too many arguments.
* @stable ICU 57
*/
SimpleFormatter(const UnicodeString& pattern, int32_t min, int32_t max,
UErrorCode &errorCode) {
applyPatternMinMaxArguments(pattern, min, max, errorCode);
}
/**
* Copy constructor.
* @stable ICU 57
*/
SimpleFormatter(const SimpleFormatter& other)
: compiledPattern(other.compiledPattern) {}
/**
* Assignment operator.
* @stable ICU 57
*/
SimpleFormatter &operator=(const SimpleFormatter& other);
/**
* Destructor.
* @stable ICU 57
*/
~SimpleFormatter();
/**
* Changes this object according to the new pattern.
*
* @param pattern The pattern string.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* Set to U_ILLEGAL_ARGUMENT_ERROR for bad argument syntax.
* @return true if U_SUCCESS(errorCode).
* @stable ICU 57
*/
UBool applyPattern(const UnicodeString &pattern, UErrorCode &errorCode) {
return applyPatternMinMaxArguments(pattern, 0, INT32_MAX, errorCode);
}
/**
* Changes this object according to the new pattern.
* The number of arguments checked against the given limits is the
* highest argument number plus one, not the number of occurrences of arguments.
*
* @param pattern The pattern string.
* @param min The pattern must have at least this many arguments.
* @param max The pattern must have at most this many arguments.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* Set to U_ILLEGAL_ARGUMENT_ERROR for bad argument syntax and
* too few or too many arguments.
* @return true if U_SUCCESS(errorCode).
* @stable ICU 57
*/
UBool applyPatternMinMaxArguments(const UnicodeString &pattern,
int32_t min, int32_t max, UErrorCode &errorCode);
/**
* @return The max argument number + 1.
* @stable ICU 57
*/
int32_t getArgumentLimit() const {
return getArgumentLimit(compiledPattern.getBuffer(), compiledPattern.length());
}
/**
* Formats the given value, appending to the appendTo builder.
* The argument value must not be the same object as appendTo.
* getArgumentLimit() must be at most 1.
*
* @param value0 Value for argument {0}.
* @param appendTo Gets the formatted pattern and value appended.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return appendTo
* @stable ICU 57
*/
UnicodeString &format(
const UnicodeString &value0,
UnicodeString &appendTo, UErrorCode &errorCode) const;
/**
* Formats the given values, appending to the appendTo builder.
* An argument value must not be the same object as appendTo.
* getArgumentLimit() must be at most 2.
*
* @param value0 Value for argument {0}.
* @param value1 Value for argument {1}.
* @param appendTo Gets the formatted pattern and values appended.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return appendTo
* @stable ICU 57
*/
UnicodeString &format(
const UnicodeString &value0,
const UnicodeString &value1,
UnicodeString &appendTo, UErrorCode &errorCode) const;
/**
* Formats the given values, appending to the appendTo builder.
* An argument value must not be the same object as appendTo.
* getArgumentLimit() must be at most 3.
*
* @param value0 Value for argument {0}.
* @param value1 Value for argument {1}.
* @param value2 Value for argument {2}.
* @param appendTo Gets the formatted pattern and values appended.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return appendTo
* @stable ICU 57
*/
UnicodeString &format(
const UnicodeString &value0,
const UnicodeString &value1,
const UnicodeString &value2,
UnicodeString &appendTo, UErrorCode &errorCode) const;
/**
* Formats the given values, appending to the appendTo string.
*
* @param values The argument values.
* An argument value must not be the same object as appendTo.
* Can be nullptr if valuesLength==getArgumentLimit()==0.
* @param valuesLength The length of the values array.
* Must be at least getArgumentLimit().
* @param appendTo Gets the formatted pattern and values appended.
* @param offsets offsets[i] receives the offset of where
* values[i] replaced pattern argument {i}.
* Can be shorter or longer than values. Can be nullptr if offsetsLength==0.
* If there is no {i} in the pattern, then offsets[i] is set to -1.
* @param offsetsLength The length of the offsets array.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return appendTo
* @stable ICU 57
*/
UnicodeString &formatAndAppend(
const UnicodeString *const *values, int32_t valuesLength,
UnicodeString &appendTo,
int32_t *offsets, int32_t offsetsLength, UErrorCode &errorCode) const;
/**
* Formats the given values, replacing the contents of the result string.
* May optimize by actually appending to the result if it is the same object
* as the value corresponding to the initial argument in the pattern.
*
* @param values The argument values.
* An argument value may be the same object as result.
* Can be nullptr if valuesLength==getArgumentLimit()==0.
* @param valuesLength The length of the values array.
* Must be at least getArgumentLimit().
* @param result Gets its contents replaced by the formatted pattern and values.
* @param offsets offsets[i] receives the offset of where
* values[i] replaced pattern argument {i}.
* Can be shorter or longer than values. Can be nullptr if offsetsLength==0.
* If there is no {i} in the pattern, then offsets[i] is set to -1.
* @param offsetsLength The length of the offsets array.
* @param errorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return result
* @stable ICU 57
*/
UnicodeString &formatAndReplace(
const UnicodeString *const *values, int32_t valuesLength,
UnicodeString &result,
int32_t *offsets, int32_t offsetsLength, UErrorCode &errorCode) const;
/**
* Returns the pattern text with none of the arguments.
* Like formatting with all-empty string values.
* @stable ICU 57
*/
UnicodeString getTextWithNoArguments() const {
    // No offsets are requested here: pass a null offsets array of length 0
    // to the static overload that works on the raw compiled pattern.
    const char16_t *units = compiledPattern.getBuffer();
    const int32_t unitCount = compiledPattern.length();
    return getTextWithNoArguments(units, unitCount, nullptr, 0);
}
#ifndef U_HIDE_INTERNAL_API
/**
* Returns the pattern text with none of the arguments.
* Like formatting with all-empty string values.
*
* TODO(ICU-20406): Replace this with an Iterator interface.
*
* @param offsets offsets[i] receives the offset of where {i} was located
* before it was replaced by an empty string.
* For example, "a{0}b{1}" produces offset 1 for i=0 and 2 for i=1.
* Can be nullptr if offsetsLength==0.
* If there is no {i} in the pattern, then offsets[i] is set to -1.
* @param offsetsLength The length of the offsets array.
*
* @internal
*/
UnicodeString getTextWithNoArguments(int32_t *offsets, int32_t offsetsLength) const {
    // Forward the caller's offsets array unchanged to the static overload
    // that works on the raw compiled pattern.
    const char16_t *units = compiledPattern.getBuffer();
    const int32_t unitCount = compiledPattern.length();
    return getTextWithNoArguments(units, unitCount, offsets, offsetsLength);
}
#endif // U_HIDE_INTERNAL_API
private:
/**
* Binary representation of the compiled pattern.
* Index 0: One more than the highest argument number.
* Followed by zero or more arguments or literal-text segments.
*
* An argument is stored as its number, less than ARG_NUM_LIMIT.
* A literal-text segment is stored as its length (at least 1) offset by ARG_NUM_LIMIT,
* followed by that many chars.
*/
UnicodeString compiledPattern;
static inline int32_t getArgumentLimit(const char16_t *compiledPattern,
                                       int32_t compiledPatternLength) {
    // An empty compiled pattern has no arguments; the buffer is not read at all.
    if (compiledPatternLength == 0) {
        return 0;
    }
    // Otherwise the first code unit stores one more than the highest argument number.
    return compiledPattern[0];
}
static UnicodeString getTextWithNoArguments(
const char16_t *compiledPattern,
int32_t compiledPatternLength,
int32_t *offsets,
int32_t offsetsLength);
static UnicodeString &format(
const char16_t *compiledPattern, int32_t compiledPatternLength,
const UnicodeString *const *values,
UnicodeString &result, const UnicodeString *resultCopy, UBool forbidResultAsValue,
int32_t *offsets, int32_t offsetsLength,
UErrorCode &errorCode);
// Give access to internals to SimpleModifier for number formatting
friend class number::impl::SimpleModifier;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __SIMPLEFORMATTER_H__

View File

@@ -0,0 +1,41 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: std_string.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009feb19
* created by: Markus W. Scherer
*/
#ifndef __STD_STRING_H__
#define __STD_STRING_H__
/**
* \file
* \brief C++ API: Central ICU header for including the C++ standard &lt;string&gt;
* header and for related definitions.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
// Workaround for a libstdc++ bug before libstdc++4.6 (2011).
// https://bugs.llvm.org/show_bug.cgi?id=13364
#if defined(__GLIBCXX__)
namespace std { class type_info; }
#endif
#include <string>
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __STD_STRING_H__

View File

@@ -0,0 +1,281 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2002-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
*/
#ifndef STRENUM_H
#define STRENUM_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
#include "unicode/unistr.h"
/**
* \file
* \brief C++ API: String Enumeration
*/
U_NAMESPACE_BEGIN
/**
* Base class for 'pure' C++ implementations of uenum api. Adds a
* method that returns the next UnicodeString since in C++ this can
* be a common storage format for strings.
*
* <p>The model is that the enumeration is over strings maintained by
* a 'service.' At any point, the service might change, invalidating
* the enumerator (though this is expected to be rare). The iterator
* returns an error if this has occurred. Lack of the error is no
* guarantee that the service didn't change immediately after the
* call, so the returned string still might not be 'valid' on
* subsequent use.</p>
*
* <p>Strings may take the form of const char*, const char16_t*, or const
* UnicodeString*. The type you get is determined by the variant of
* 'next' that you call. In general the StringEnumeration is
* optimized for one of these types, but all StringEnumerations can
* return all types. Returned strings are each terminated with a NUL.
* Depending on the service data, they might also include embedded NUL
* characters, so API is provided to optionally return the true
* length, counting the embedded NULs but not counting the terminating
* NUL.</p>
*
* <p>The pointers returned by next, unext, and snext become invalid
* upon any subsequent call to the enumeration's destructor, next,
* unext, snext, or reset.</p>
*
* <p>count() and reset() are pure virtual, so this class cannot be
* instantiated directly; the constructor is protected for use by subclasses.</p>
*
* ICU 2.8 adds some default implementations and helper functions
* for subclasses.
*
* @stable ICU 2.4
*/
class U_COMMON_API StringEnumeration : public UObject {
public:
/**
* Destructor.
* @stable ICU 2.4
*/
virtual ~StringEnumeration();
/**
* Clone this object, an instance of a subclass of StringEnumeration.
* Clones can be used concurrently in multiple threads.
* If a subclass does not implement clone(), or if an error occurs,
* then nullptr is returned.
* The caller must delete the clone.
*
* @return a clone of this object
*
* @see getDynamicClassID
* @stable ICU 2.8
*/
virtual StringEnumeration *clone() const;
/**
* <p>Return the number of elements that the iterator traverses. If
* the iterator is out of sync with its service, status is set to
* U_ENUM_OUT_OF_SYNC_ERROR, and the return value is zero.</p>
*
* <p>The return value will not change except possibly as a result of
* a subsequent call to reset, or if the iterator becomes out of sync.</p>
*
* <p>This is a convenience function. It can end up being very
* expensive as all the items might have to be pre-fetched
* (depending on the storage format of the data being
* traversed).</p>
*
* <p>Pure virtual: there is no default implementation, so every
* concrete subclass must provide one.</p>
*
* @param status the error code.
* @return number of elements in the iterator.
*
* @stable ICU 2.4 */
virtual int32_t count(UErrorCode& status) const = 0;
/**
* <p>Returns the next element as a NUL-terminated char*. If there
* are no more elements, returns nullptr. If the resultLength pointer
* is not nullptr, the length of the string (not counting the
* terminating NUL) is returned at that address. If an error
* status is returned, the value at resultLength is undefined.</p>
*
* <p>The returned pointer is owned by this iterator and must not be
* deleted by the caller. The pointer is valid until the next call
* to next, unext, snext, reset, or the enumerator's destructor.</p>
*
* <p>If the iterator is out of sync with its service, status is set
* to U_ENUM_OUT_OF_SYNC_ERROR and nullptr is returned.</p>
*
* <p>If the native service string is a char16_t* string, it is
* converted to char* with the invariant converter. If the
* conversion fails (because a character cannot be converted) then
* status is set to U_INVARIANT_CONVERSION_ERROR and the return
* value is undefined (though not nullptr).</p>
*
* Starting with ICU 2.8, the default implementation calls snext()
* and handles the conversion.
* Either next() or snext() must be implemented differently by a subclass.
*
* @param status the error code.
* @param resultLength a pointer to receive the length, can be nullptr.
* @return a pointer to the string, or nullptr.
*
* @stable ICU 2.4
*/
virtual const char* next(int32_t *resultLength, UErrorCode& status);
/**
* <p>Returns the next element as a NUL-terminated char16_t*. If there
* are no more elements, returns nullptr. If the resultLength pointer
* is not nullptr, the length of the string (not counting the
* terminating NUL) is returned at that address. If an error
* status is returned, the value at resultLength is undefined.</p>
*
* <p>The returned pointer is owned by this iterator and must not be
* deleted by the caller. The pointer is valid until the next call
* to next, unext, snext, reset, or the enumerator's destructor.</p>
*
* <p>If the iterator is out of sync with its service, status is set
* to U_ENUM_OUT_OF_SYNC_ERROR and nullptr is returned.</p>
*
* Starting with ICU 2.8, the default implementation calls snext()
* and handles the conversion.
*
* @param status the error code.
* @param resultLength a pointer to receive the length, can be nullptr.
* @return a pointer to the string, or nullptr.
*
* @stable ICU 2.4
*/
virtual const char16_t* unext(int32_t *resultLength, UErrorCode& status);
/**
* <p>Returns the next element as a UnicodeString*. If there are no
* more elements, returns nullptr.</p>
*
* <p>The returned pointer is owned by this iterator and must not be
* deleted by the caller. The pointer is valid until the next call
* to next, unext, snext, reset, or the enumerator's destructor.</p>
*
* <p>If the iterator is out of sync with its service, status is set
* to U_ENUM_OUT_OF_SYNC_ERROR and nullptr is returned.</p>
*
* Starting with ICU 2.8, the default implementation calls next()
* and handles the conversion.
* Either next() or snext() must be implemented differently by a subclass.
*
* @param status the error code.
* @return a pointer to the string, or nullptr.
*
* @stable ICU 2.4
*/
virtual const UnicodeString* snext(UErrorCode& status);
/**
* <p>Resets the iterator. This re-establishes sync with the
* service and rewinds the iterator to start at the first
* element.</p>
*
* <p>Previous pointers returned by next, unext, or snext become
* invalid, and the value returned by count might change.</p>
*
* <p>Pure virtual: there is no default implementation, so every
* concrete subclass must provide one.</p>
*
* @param status the error code.
*
* @stable ICU 2.4
*/
virtual void reset(UErrorCode& status) = 0;
/**
* Compares this enumeration to other to check if both are equal
*
* @param that The other string enumeration to compare this object to
* @return true if the enumerations are equal. false if not.
* @stable ICU 3.6
*/
virtual bool operator==(const StringEnumeration& that)const;
/**
* Compares this enumeration to other to check if both are not equal
*
* @param that The other string enumeration to compare this object to
* @return true if the enumerations are not equal. false if they are equal.
* @stable ICU 3.6
*/
virtual bool operator!=(const StringEnumeration& that)const;
protected:
/**
* UnicodeString field for use with default implementations and subclasses.
* @stable ICU 2.8
*/
UnicodeString unistr;
/**
* char * default buffer for use with default implementations and subclasses.
* Fixed-size array of 32 chars stored inside the object itself.
* @stable ICU 2.8
*/
char charsBuffer[32];
/**
* char * buffer for use with default implementations and subclasses.
* Allocated in constructor and in ensureCharsCapacity().
* @stable ICU 2.8
*/
char *chars;
/**
* Capacity of chars, for use with default implementations and subclasses.
* @stable ICU 2.8
*/
int32_t charsCapacity;
/**
* Default constructor for use with default implementations and subclasses.
* Protected, so it can only be invoked from subclasses.
* @stable ICU 2.8
*/
StringEnumeration();
/**
* Ensures that chars is at least as large as the requested capacity.
* For use with default implementations and subclasses.
*
* @param capacity Requested capacity.
* @param status ICU in/out error code.
* @stable ICU 2.8
*/
void ensureCharsCapacity(int32_t capacity, UErrorCode &status);
/**
* Converts s to Unicode and sets unistr to the result.
* For use with default implementations and subclasses,
* especially for implementations of snext() in terms of next().
* This is provided with a helper function instead of a default implementation
* of snext() to avoid potential infinite loops between next() and snext().
*
* For example:
* \code
* const UnicodeString* snext(UErrorCode& status) {
* int32_t resultLength=0;
* const char *s=next(&resultLength, status);
* return setChars(s, resultLength, status);
* }
* \endcode
*
* @param s String to be converted to Unicode.
* @param length Length of the string.
* @param status ICU in/out error code.
* @return A pointer to unistr.
* @stable ICU 2.8
*/
UnicodeString *setChars(const char *s, int32_t length, UErrorCode &status);
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
/* STRENUM_H */
#endif

View File

@@ -0,0 +1,190 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// stringoptions.h
// created: 2017jun08 Markus W. Scherer
#ifndef __STRINGOPTIONS_H__
#define __STRINGOPTIONS_H__
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Bit set option bit constants for various string and character processing functions.
*/
/**
* Option value for case folding: Use default mappings defined in CaseFolding.txt.
*
* @stable ICU 2.0
*/
#define U_FOLD_CASE_DEFAULT 0
/**
* Option value for case folding:
*
* Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
* and dotless i appropriately for Turkic languages (tr, az).
*
* Before Unicode 3.2, CaseFolding.txt contains mappings marked with 'I' that
* are to be included for default mappings and
* excluded for the Turkic-specific mappings.
*
* Unicode 3.2 CaseFolding.txt instead contains mappings marked with 'T' that
* are to be excluded for default mappings and
* included for the Turkic-specific mappings.
*
* @stable ICU 2.0
*/
#define U_FOLD_CASE_EXCLUDE_SPECIAL_I 1
/**
* Titlecase the string as a whole rather than each word.
* (Titlecase only the character at index 0, possibly adjusted.)
* Option bits value for titlecasing APIs that take an options bit set.
*
* It is an error to specify multiple titlecasing iterator options together,
* including both an options bit and an explicit BreakIterator.
*
* @see U_TITLECASE_ADJUST_TO_CASED
* @stable ICU 60
*/
#define U_TITLECASE_WHOLE_STRING 0x20
/**
* Titlecase sentences rather than words.
* (Titlecase only the first character of each sentence, possibly adjusted.)
* Option bits value for titlecasing APIs that take an options bit set.
*
* It is an error to specify multiple titlecasing iterator options together,
* including both an options bit and an explicit BreakIterator.
*
* @see U_TITLECASE_ADJUST_TO_CASED
* @stable ICU 60
*/
#define U_TITLECASE_SENTENCES 0x40
/**
* Do not lowercase non-initial parts of words when titlecasing.
* Option bit for titlecasing APIs that take an options bit set.
*
* By default, titlecasing will titlecase the character at each
* (possibly adjusted) BreakIterator index and
* lowercase all other characters up to the next iterator index.
* With this option, the other characters will not be modified.
*
* @see U_TITLECASE_ADJUST_TO_CASED
* @see UnicodeString::toTitle
* @see CaseMap::toTitle
* @see ucasemap_setOptions
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
* @stable ICU 3.8
*/
#define U_TITLECASE_NO_LOWERCASE 0x100
/**
* Do not adjust the titlecasing BreakIterator indexes;
* titlecase exactly the characters at breaks from the iterator.
* Option bit for titlecasing APIs that take an options bit set.
*
* By default, titlecasing will take each break iterator index,
* adjust it to the next relevant character (see U_TITLECASE_ADJUST_TO_CASED),
* and titlecase that one.
*
* Other characters are lowercased.
*
* It is an error to specify multiple titlecasing adjustment options together.
*
* @see U_TITLECASE_ADJUST_TO_CASED
* @see U_TITLECASE_NO_LOWERCASE
* @see UnicodeString::toTitle
* @see CaseMap::toTitle
* @see ucasemap_setOptions
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
* @stable ICU 3.8
*/
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
/**
* Adjust each titlecasing BreakIterator index to the next cased character.
* (See the Unicode Standard, chapter 3, Default Case Conversion, R3 toTitlecase(X).)
* Option bit for titlecasing APIs that take an options bit set.
*
* This used to be the default index adjustment in ICU.
* Since ICU 60, the default index adjustment is to the next character that is
* a letter, number, symbol, or private use code point.
* (Uncased modifier letters are skipped.)
* The difference in behavior is small for word titlecasing,
* but the new adjustment is much better for whole-string and sentence titlecasing:
* It yields "49ers" and "«丰(abc)»" instead of "49Ers" and "«丰(Abc)»".
*
* It is an error to specify multiple titlecasing adjustment options together.
*
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
* @stable ICU 60
*/
#define U_TITLECASE_ADJUST_TO_CASED 0x400
/**
* Option for string transformation functions to not first reset the Edits object.
* Used for example in some case-mapping and normalization functions.
*
* @see CaseMap
* @see Edits
* @see Normalizer2
* @stable ICU 60
*/
#define U_EDITS_NO_RESET 0x2000
/**
* Omit unchanged text when recording how source substrings
* relate to changed and unchanged result substrings.
* Used for example in some case-mapping and normalization functions.
*
* @see CaseMap
* @see Edits
* @see Normalizer2
* @stable ICU 60
*/
#define U_OMIT_UNCHANGED_TEXT 0x4000
/**
* Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc:
* Compare strings in code point order instead of code unit order.
* @stable ICU 2.2
*/
#define U_COMPARE_CODE_POINT_ORDER 0x8000
/**
* Option bit for unorm_compare:
* Perform case-insensitive comparison.
* @stable ICU 2.2
*/
#define U_COMPARE_IGNORE_CASE 0x10000
/**
* Option bit for unorm_compare:
* Both input strings are assumed to fulfill FCD conditions.
* @stable ICU 2.2
*/
#define UNORM_INPUT_IS_FCD 0x20000
// Related definitions elsewhere.
// Options that are not meaningful in the same functions
// can share the same bits.
//
// Public:
// unicode/unorm.h #define UNORM_COMPARE_NORM_OPTIONS_SHIFT 20
//
// Internal: (may change or be removed)
// ucase.h #define _STRCASECMP_OPTIONS_MASK 0xffff
// ucase.h #define _FOLD_CASE_OPTIONS_MASK 7
// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
// ustr_imp.h #define _STRNCMP_STYLE 0x1000
// unormcmp.cpp #define _COMPARE_EQUIV 0x80000
#endif // __STRINGOPTIONS_H__

View File

@@ -0,0 +1,354 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// Copyright (C) 2009-2013, International Business Machines
// Corporation and others. All Rights Reserved.
//
// Copyright 2001 and onwards Google Inc.
// Author: Sanjay Ghemawat
// This code is a contribution of Google code, and the style used here is
// a compromise between the original Google code and the ICU coding guidelines.
// For example, data types are ICU-ified (size_t,int->int32_t),
// and API comments doxygen-ified, but function names and behavior are
// as in the original, if possible.
// Assertion-style error handling, not available in ICU, was changed to
// parameter "pinning" similar to UnicodeString.
//
// In addition, this is only a partial port of the original Google code,
// limited to what was needed so far. The (nearly) complete original code
// is in the ICU svn repository at icuhtml/trunk/design/strings/contrib
// (see ICU ticket 6765, r25517).
#ifndef __STRINGPIECE_H__
#define __STRINGPIECE_H__
/**
* \file
* \brief C++ API: StringPiece: Read-only byte string wrapper class.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include <cstddef>
#include <string_view>
#include <type_traits>
#include "unicode/uobject.h"
#include "unicode/std_string.h"
// Arghh! I wish C++ literals were "string".
U_NAMESPACE_BEGIN
/**
 * A string-like, read-only object that refers to a sized piece of memory
 * through a pointer and a length.
 *
 * The single-argument constructors are intentionally non-explicit, so that
 * callers can pass a "const char*" or a "string" wherever a "StringPiece"
 * is expected and have it converted implicitly.
 *
 * Functions or methods may therefore declare StringPiece parameters in order
 * to accept either a "const char*" or a "string" value.
 *
 * Systematic usage of StringPiece is encouraged, because it reduces
 * unnecessary conversions from "const char*" to "string" and back again.
 *
 * @stable ICU 4.2
 */
class U_COMMON_API StringPiece : public UMemory {
private:
    const char* ptr_;
    int32_t length_;

public:
    /**
     * Default constructor; the result is an empty StringPiece
     * (null pointer, length 0).
     * @stable ICU 4.2
     */
    StringPiece()
        : ptr_(nullptr), length_(0) {}

    /**
     * Constructs from a NUL-terminated const char * pointer.
     * @param str a NUL-terminated const char * pointer
     * @stable ICU 4.2
     */
    StringPiece(const char* str);

#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
    /**
     * Constructs from a NUL-terminated const char8_t * pointer.
     * Delegates to the const char * constructor after reinterpreting the pointer.
     * @param str a NUL-terminated const char8_t * pointer
     * @stable ICU 67
     */
    StringPiece(const char8_t* str)
        : StringPiece(reinterpret_cast<const char*>(str)) {}
#endif

    /**
     * Constructs an empty StringPiece from a null pointer constant.
     * This overload exists to disambiguate nullptr among the other
     * single-argument constructors.
     * @param p nullptr
     * @stable ICU 67
     */
    StringPiece(std::nullptr_t p)
        : ptr_(p), length_(0) {}

    /**
     * Constructs from a std::string, referring to its data() and size().
     * @stable ICU 4.2
     */
    StringPiece(const std::string& str)
        : ptr_(str.data()),
          length_(static_cast<int32_t>(str.size())) {}

#if defined(__cpp_lib_char8_t) || defined(U_IN_DOXYGEN)
    /**
     * Constructs from a std::u8string, referring to its data() and size().
     * @stable ICU 67
     */
    StringPiece(const std::u8string& str)
        : ptr_(reinterpret_cast<const char*>(str.data())),
          length_(static_cast<int32_t>(str.size())) {}
#endif

    /**
     * Constructs from another string piece implementation, that is, from any
     * C++ record type that provides these two methods:
     *
     * \code{.cpp}
     *
     * struct OtherStringPieceClass {
     *   const char* data();  // or const char8_t*
     *   size_t size();
     * };
     *
     * \endcode
     *
     * Typical candidates are std::string_view from C++17
     * or absl::string_view from Abseil.
     *
     * Starting with C++20, data() may also return a const char8_t* pointer,
     * as from std::u8string_view.
     *
     * @param str the other string piece
     * @stable ICU 65
     */
    template <typename T,
              typename = std::enable_if_t<
                  (std::is_same_v<decltype(T().data()), const char*>
#if defined(__cpp_char8_t)
                   || std::is_same_v<decltype(T().data()), const char8_t*>
#endif
                  ) &&
                  std::is_same_v<decltype(T().size()), size_t>>>
    StringPiece(T str)
        : ptr_(reinterpret_cast<const char*>(str.data())),
          length_(static_cast<int32_t>(str.size())) {}

    /**
     * Constructs from a const char * pointer and a specified length.
     * @param offset a const char * pointer (need not be terminated)
     * @param len the length of the string; must be non-negative
     * @stable ICU 4.2
     */
    StringPiece(const char* offset, int32_t len)
        : ptr_(offset), length_(len) {}

#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
    /**
     * Constructs from a const char8_t * pointer and a specified length.
     * Delegates to the (const char *, length) constructor.
     * @param str a const char8_t * pointer (need not be terminated)
     * @param len the length of the string; must be non-negative
     * @stable ICU 67
     */
    StringPiece(const char8_t* str, int32_t len)
        : StringPiece(reinterpret_cast<const char*>(str), len) {}
#endif

    /**
     * Substring of another StringPiece, starting at pos.
     * @param x the other StringPiece
     * @param pos start position in x; must be non-negative and <= x.length().
     * @stable ICU 4.2
     */
    StringPiece(const StringPiece& x, int32_t pos);

    /**
     * Substring of another StringPiece, starting at pos with a given length.
     * @param x the other StringPiece
     * @param pos start position in x; must be non-negative and <= x.length().
     * @param len length of the substring;
     *            must be non-negative and will be pinned to at most x.length() - pos.
     * @stable ICU 4.2
     */
    StringPiece(const StringPiece& x, int32_t pos, int32_t len);

#ifndef U_HIDE_INTERNAL_API
    /**
     * Converts to a std::string_view() over the same pointer and length.
     * @internal
     */
    inline operator std::string_view() const {
        return std::string_view(
            data(), static_cast<std::string_view::size_type>(size()));
    }
#endif  // U_HIDE_INTERNAL_API

    /**
     * Returns the string pointer. May be nullptr if it is empty.
     *
     * The buffer that data() points to may contain embedded NULs, and it
     * may or may not be NUL-terminated. Passing data() to a routine that
     * expects a NUL-terminated string is therefore typically a mistake.
     * @return the string pointer
     * @stable ICU 4.2
     */
    const char* data() const {
        return ptr_;
    }

    /**
     * Returns the string length. Same as length().
     * @return the string length
     * @stable ICU 4.2
     */
    int32_t size() const {
        return length_;
    }

    /**
     * Returns the string length. Same as size().
     * @return the string length
     * @stable ICU 4.2
     */
    int32_t length() const {
        return length_;
    }

    /**
     * Returns whether the string is empty, i.e. whether its length is 0.
     * @return true if the string is empty
     * @stable ICU 4.2
     */
    UBool empty() const {
        return length_ == 0;
    }

    /**
     * Sets to an empty string (null pointer, length 0).
     * @stable ICU 4.2
     */
    void clear() {
        ptr_ = nullptr;
        length_ = 0;
    }

    /**
     * Resets the stringpiece to refer to new data.
     * @param xdata pointer to the new string data. Need not be NUL-terminated.
     * @param len the length of the new data
     * @stable ICU 4.8
     */
    void set(const char* xdata, int32_t len) {
        ptr_ = xdata;
        length_ = len;
    }

    /**
     * Resets the stringpiece to refer to new data.
     * @param str a pointer to a NUL-terminated string.
     * @stable ICU 4.8
     */
    void set(const char* str);

#if defined(__cpp_char8_t) || defined(U_IN_DOXYGEN)
    /**
     * Resets the stringpiece to refer to new data.
     * Forwards to the (const char *, length) overload.
     * @param xdata pointer to the new string data. Need not be NUL-terminated.
     * @param len the length of the new data
     * @stable ICU 67
     */
    inline void set(const char8_t* xdata, int32_t len) {
        const char* bytes = reinterpret_cast<const char*>(xdata);
        set(bytes, len);
    }

    /**
     * Resets the stringpiece to refer to new data.
     * Forwards to the const char * overload.
     * @param str a pointer to a NUL-terminated string.
     * @stable ICU 67
     */
    inline void set(const char8_t* str) {
        const char* bytes = reinterpret_cast<const char*>(str);
        set(bytes);
    }
#endif

    /**
     * Removes the first n string units.
     * A negative n leaves the StringPiece unchanged; an n greater than
     * length() is pinned to length().
     * @param n prefix length, must be non-negative and <=length()
     * @stable ICU 4.2
     */
    void remove_prefix(int32_t n) {
        if (n < 0) {
            return;  // out-of-contract value: ignore
        }
        if (n > length_) {
            n = length_;  // pin so that the length never becomes negative
        }
        ptr_ += n;
        length_ -= n;
    }

    /**
     * Removes the last n string units.
     * A negative n leaves the StringPiece unchanged; an n greater than
     * length() results in length 0.
     * @param n suffix length, must be non-negative and <=length()
     * @stable ICU 4.2
     */
    void remove_suffix(int32_t n) {
        if (n < 0) {
            return;  // out-of-contract value: ignore
        }
        length_ = (n <= length_) ? (length_ - n) : 0;
    }

    /**
     * Searches the StringPiece for the given search string (needle).
     * @param needle The string for which to search.
     * @param offset Where to start searching within this string (haystack).
     * @return The offset of needle in haystack, or -1 if not found.
     * @stable ICU 67
     */
    int32_t find(StringPiece needle, int32_t offset);

    /**
     * Compares this StringPiece with the other StringPiece, with semantics
     * similar to std::string::compare().
     * @param other The string to compare to.
     * @return below zero if this < other; above zero if this > other; 0 if this == other.
     * @stable ICU 67
     */
    int32_t compare(StringPiece other);

    /**
     * Maximum integer, used as a default value for substring methods.
     * @stable ICU 4.2
     */
    static const int32_t npos; // = 0x7fffffff;

    /**
     * Returns a substring of this StringPiece, built with the
     * (StringPiece, pos, len) substring constructor.
     * @param pos start position; must be non-negative and <= length().
     * @param len length of the substring;
     *            must be non-negative and will be pinned to at most length() - pos.
     * @return the substring StringPiece
     * @stable ICU 4.2
     */
    StringPiece substr(int32_t pos, int32_t len = npos) const {
        return StringPiece(*this, pos, len);
    }
};
/**
* Global operator == for StringPiece
* @param x The first StringPiece to compare.
* @param y The second StringPiece to compare.
* @return true if the string data is equal
* @stable ICU 4.8
*/
U_EXPORT UBool U_EXPORT2
operator==(const StringPiece& x, const StringPiece& y);
/**
* Global operator != for StringPiece
* @param x The first StringPiece to compare.
* @param y The second StringPiece to compare.
* @return true if the string data is not equal
* @stable ICU 4.8
*/
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
    // Defined purely as the negation of operator==, so the two operators
    // can never disagree.
    const auto equal = (x == y);
    return !equal;
}
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __STRINGPIECE_H__

View File

@@ -0,0 +1,426 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2012,2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: stringtriebuilder.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec24
* created by: Markus W. Scherer
*/
#ifndef __STRINGTRIEBUILDER_H__
#define __STRINGTRIEBUILDER_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: Builder API for trie builders
*/
// Forward declaration.
/// \cond
struct UHashtable;
typedef struct UHashtable UHashtable;
/// \endcond
/**
* Build options for BytesTrieBuilder and CharsTrieBuilder.
* The enumerators have no explicit initializers, so they take the
* implicit values 0 and 1 in declaration order.
* @stable ICU 4.8
*/
enum UStringTrieBuildOption {
/**
* Builds a trie quickly.
* First enumerator (implicit value 0).
* @stable ICU 4.8
*/
USTRINGTRIE_BUILD_FAST,
/**
* Builds a trie more slowly, attempting to generate
* a shorter but equivalent serialization.
* This build option also uses more memory.
*
* This option can be effective when many integer values are the same
* and string/byte sequence suffixes can be shared.
* Runtime speed is not expected to improve.
* Second enumerator (implicit value 1).
* @stable ICU 4.8
*/
USTRINGTRIE_BUILD_SMALL
};
U_NAMESPACE_BEGIN
/**
* Base class for string trie builder classes.
*
* This class is not intended for public subclassing.
* @stable ICU 4.8
*/
class U_COMMON_API StringTrieBuilder : public UObject {
public:
#ifndef U_HIDE_INTERNAL_API
/** @internal */
static int32_t hashNode(const void *node);
/** @internal */
static UBool equalNodes(const void *left, const void *right);
#endif /* U_HIDE_INTERNAL_API */
protected:
// Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
// or else the compiler will create a public default constructor.
/** @internal */
StringTrieBuilder();
/** @internal */
virtual ~StringTrieBuilder();
#ifndef U_HIDE_INTERNAL_API
/** @internal */
void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
/** @internal */
void deleteCompactBuilder();
/** @internal */
void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
/** @internal */
int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
/** @internal */
int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
#endif /* U_HIDE_INTERNAL_API */
class Node;
#ifndef U_HIDE_INTERNAL_API
/** @internal */
Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
/** @internal */
Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
int32_t length, UErrorCode &errorCode);
#endif /* U_HIDE_INTERNAL_API */
/** @internal */
virtual int32_t getElementStringLength(int32_t i) const = 0;
/** @internal */
virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
/** @internal */
virtual int32_t getElementValue(int32_t i) const = 0;
// Finds the first unit index after this one where
// the first and last element have different units again.
/** @internal */
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
// Number of different units at unitIndex.
/** @internal */
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
/** @internal */
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
/** @internal */
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
/** @internal */
virtual UBool matchNodesCanHaveValues() const = 0;
/** @internal */
virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
/** @internal */
virtual int32_t getMinLinearMatch() const = 0;
/** @internal */
virtual int32_t getMaxLinearMatchLength() const = 0;
#ifndef U_HIDE_INTERNAL_API
// max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
/** @internal */
static const int32_t kMaxBranchLinearSubNodeLength=5;
// Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
// log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
/** @internal */
static const int32_t kMaxSplitBranchLevels=14;
/**
* Makes sure that there is only one unique node registered that is
* equivalent to newNode.
* @param newNode Input node. The builder takes ownership.
* @param errorCode ICU in/out UErrorCode.
Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
* @return newNode if it is the first of its kind, or
* an equivalent node if newNode is a duplicate.
* @internal
*/
Node *registerNode(Node *newNode, UErrorCode &errorCode);
/**
* Makes sure that there is only one unique FinalValueNode registered
* with this value.
* Avoids creating a node if the value is a duplicate.
* @param value A final value.
* @param errorCode ICU in/out UErrorCode.
Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
* @return A FinalValueNode with the given value.
* @internal
*/
Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
#endif /* U_HIDE_INTERNAL_API */
/*
* C++ note:
* registerNode() and registerFinalValue() take ownership of their input nodes,
* and only return owned nodes.
* If they see a failure UErrorCode, they will delete the input node.
* If they get a null pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
* If there is a failure, they return nullptr.
*
* Null Node pointers can be safely passed into other Nodes because
* they call the static Node::hashCode() which checks for a null pointer first.
*
* Therefore, as long as builder functions register a new node,
* they need to check for failures only before explicitly dereferencing
* a Node pointer, or before setting a new UErrorCode.
*/
// Hash set of nodes, maps from nodes to integer 1.
/** @internal */
UHashtable *nodes;
// Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
// it is needed for layout of other objects.
/**
* @internal
* \cond
*/
class Node : public UObject {
public:
    // Stores the caller-supplied hash seed; the offset starts out as 0.
    Node(int32_t initialHash) : hash(initialHash), offset(0) {}
    inline int32_t hashCode() const { return hash; }
    // Null-tolerant variant: a nullptr node hashes to 0.
    static inline int32_t hashCode(const Node *node) {
        if(node==nullptr) {
            return 0;
        }
        return node->hashCode();
    }
    // The base class operator==() compares the actual class types.
    virtual bool operator==(const Node &other) const;
    inline bool operator!=(const Node &other) const { return !this->operator==(other); }
    /**
     * Walks the Node graph and assigns numbers to branch edges, visiting
     * rightmost edges first, so that a duplicate node is not written twice.
     *
     * Branch nodes in this trie data structure are asymmetric:
     * most branch edges "jump" to other nodes, whereas the rightmost branch edge
     * simply continues without a jump.
     * Consequently write() has to emit the rightmost branch edge last
     * (trie units are written backwards), and has to emit it at that point even
     * when it duplicates a node that was already written elsewhere.
     *
     * Right branch edges are therefore visited and marked first.
     * Edge numbers are increasingly negative because they share the offset field,
     * which receives positive values once nodes are written.
     * A branch edge also records the first number used for any of its edges.
     *
     * If a further-left branch edge carries a number within the range of the
     * rightmost edge's numbers, it will be written as part of the required
     * right edge, so writing it beforehand can be skipped.
     *
     * After root.markRightEdgesFirst(-1) every node's offset is a negative
     * edge number.
     *
     * @param edgeNumber The first edge number for this node and its sub-nodes.
     * @return An edge number that is at least the maximum-negative
     *         of the input edge number and the numbers of this node and all of its sub-nodes.
     */
    virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    // Implementations of write() must set the offset to a positive value.
    virtual void write(StringTrieBuilder &builder) = 0;
    // Companion of markRightEdgesFirst(): writes this node unless it belongs
    // to the still-unwritten right branch edge.
    inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
                                           StringTrieBuilder &builder) {
        // Only nodes that still carry a negative edge number are candidates;
        // a node with offset>0 and its sub-nodes have been written already.
        if(offset>=0) {
            return;
        }
        // Edge numbers are negative, so lastRight<=firstRight.
        // A node numbered inside [lastRight, firstRight] is part of the
        // right branch edge and is written together with that edge later.
        const bool insideRightEdge = lastRight<=offset && offset<=firstRight;
        if(!insideRightEdge) {
            write(builder);
        }
    }
    inline int32_t getOffset() const { return offset; }
protected:
    int32_t hash;
    int32_t offset;
};
#ifndef U_HIDE_INTERNAL_API
// This class should not be subclassed because
// registerFinalValue() compares a stack-allocated FinalValueNode
// (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
// with the input node, and the
// Node::operator==(other) check used inside FinalValueNode::operator==(other)
// compares the actual class types, so it would not match if the typeid's are different.
/** @internal */
class FinalValueNode : public Node {
public:
// Seeds the node hash from the constant 0x111111 combined with the value itself.
FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
virtual bool operator==(const Node &other) const override;
virtual void write(StringTrieBuilder &builder) override;
protected:
// The final value passed to the constructor.
int32_t value;
};
#endif /* U_HIDE_INTERNAL_API */
// Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
// it is needed for layout of other objects.
/**
* @internal
*/
class ValueNode : public Node {
public:
    // A new ValueNode has no value (hasValue is false) until setValue() is called.
    ValueNode(int32_t initialHash) : Node(initialHash), hasValue(false), value(0) {}
    virtual bool operator==(const Node &other) const override;
    // Records the value, marks the node as having one,
    // and mixes the value into the node hash.
    void setValue(int32_t v) {
        hash=hash*37u+v;
        value=v;
        hasValue=true;
    }
protected:
    UBool hasValue;
    int32_t value;
};
#ifndef U_HIDE_INTERNAL_API
/**
* @internal
*/
class IntermediateValueNode : public ValueNode {
public:
// Seeds the hash from the constant 0x222222 and the hash of nextNode
// (hashCode() tolerates nextNode==nullptr), then calls setValue(v),
// which sets hasValue and mixes v into the hash as well.
IntermediateValueNode(int32_t v, Node *nextNode)
: ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
virtual bool operator==(const Node &other) const override;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
virtual void write(StringTrieBuilder &builder) override;
protected:
// The node passed to the constructor as nextNode.
Node *next;
};
#endif /* U_HIDE_INTERNAL_API */
// Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
// it is needed for layout of other objects.
/**
* @internal
*/
class LinearMatchNode : public ValueNode {
public:
// Seeds the hash from the constant 0x333333, the match length,
// and the hash of nextNode (hashCode() tolerates nextNode==nullptr).
// No value is set here; hasValue stays false until setValue() is called.
LinearMatchNode(int32_t len, Node *nextNode)
: ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
length(len), next(nextNode) {}
virtual bool operator==(const Node &other) const override;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
// Note: write() is not overridden here and is pure virtual in Node,
// so this class is abstract; see createLinearMatchNode().
protected:
// The len value passed to the constructor.
int32_t length;
// The node passed to the constructor as nextNode.
Node *next;
};
#ifndef U_HIDE_INTERNAL_API
/**
* @internal
*/
class BranchNode : public Node {
public:
    // firstEdgeNumber is explicitly initialized to 0 so that it never holds an
    // indeterminate value, even if it is read before an edge number has been
    // assigned to it.
    // NOTE(review): presumably the markRightEdgesFirst() overrides of the
    // subclasses assign the real value; their implementation is not in this header.
    BranchNode(int32_t initialHash) : Node(initialHash), firstEdgeNumber(0) {}
protected:
    // First edge number recorded for any of this branch's edges
    // (see the description of Node::markRightEdgesFirst()).
    int32_t firstEdgeNumber;
};
/**
* @internal
*/
class ListBranchNode : public BranchNode {
public:
ListBranchNode() : BranchNode(0x444444), length(0) {}
virtual bool operator==(const Node &other) const override;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
virtual void write(StringTrieBuilder &builder) override;
// Adds a unit with a final value.
void add(int32_t c, int32_t value) {
units[length] = static_cast<char16_t>(c);
equal[length]=nullptr;
values[length]=value;
++length;
hash=(hash*37u+c)*37u+value;
}
// Adds a unit which leads to another match node.
void add(int32_t c, Node *node) {
units[length] = static_cast<char16_t>(c);
equal[length]=node;
values[length]=0;
++length;
hash=(hash*37u+c)*37u+hashCode(node);
}
protected:
Node *equal[kMaxBranchLinearSubNodeLength]; // nullptr means "has final value".
int32_t length;
int32_t values[kMaxBranchLinearSubNodeLength];
char16_t units[kMaxBranchLinearSubNodeLength];
};
/**
* @internal
*/
class SplitBranchNode : public BranchNode {
public:
// Seeds the hash from the constant 0x555555, the middle unit, and the hashes
// of both sub-nodes (hashCode() tolerates nullptr sub-nodes).
SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
: BranchNode(((0x555555u*37u+middleUnit)*37u+
hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
virtual bool operator==(const Node &other) const override;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
virtual void write(StringTrieBuilder &builder) override;
protected:
// The middleUnit passed to the constructor.
char16_t unit;
// NOTE(review): presumably the sub-nodes for units below / at-or-above
// `unit`, going by the names; confirm against the implementation.
Node *lessThan;
Node *greaterOrEqual;
};
// Branch head node, for writing the actual node lead unit.
/** @internal */
class BranchHeadNode : public ValueNode {
public:
// Seeds the hash from the constant 0x666666, the length,
// and the hash of subNode (hashCode() tolerates subNode==nullptr).
// No value is set here; hasValue stays false until setValue() is called.
BranchHeadNode(int32_t len, Node *subNode)
: ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
length(len), next(subNode) {}
virtual bool operator==(const Node &other) const override;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
virtual void write(StringTrieBuilder &builder) override;
protected:
// The len value passed to the constructor.
int32_t length;
Node *next; // A branch sub-node.
};
#endif /* U_HIDE_INTERNAL_API */
/// \endcond
/** @internal */
virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
Node *nextNode) const = 0;
/** @internal */
virtual int32_t write(int32_t unit) = 0;
/** @internal */
virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
/** @internal */
virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
/** @internal */
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
/** @internal */
virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __STRINGTRIEBUILDER_H__

View File

@@ -0,0 +1,119 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2000-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 02/04/00 aliu Creation.
**********************************************************************
*/
#ifndef SYMTABLE_H
#define SYMTABLE_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: An interface that defines both lookup protocol and parsing of
* symbolic names.
*/
U_NAMESPACE_BEGIN
class ParsePosition;
class UnicodeFunctor;
class UnicodeSet;
class UnicodeString;
/**
* An interface that defines both lookup protocol and parsing of
* symbolic names.
*
* <p>A symbol table maintains two kinds of mappings. The first is
* between symbolic names and their values. For example, if the
* variable with the name "start" is set to the value "alpha"
* (perhaps, though not necessarily, through an expression such as
* "$start=alpha"), then the call lookup("start") will return the
* char[] array ['a', 'l', 'p', 'h', 'a'].
*
* <p>The second kind of mapping is between character values and
* UnicodeMatcher objects. This is used by RuleBasedTransliterator,
* which uses characters in the private use area to represent objects
* such as UnicodeSets. If U+E015 is mapped to the UnicodeSet [a-z],
* then lookupMatcher(0xE015) will return the UnicodeSet [a-z].
*
* <p>Finally, a symbol table defines parsing behavior for symbolic
* names. All symbolic names start with the SYMBOL_REF character.
* When a parser encounters this character, it calls parseReference()
* with the position immediately following the SYMBOL_REF. The symbol
* table parses the name, if there is one, and returns it.
*
* @stable ICU 2.8
*/
class U_COMMON_API SymbolTable /* not : public UObject because this is an interface/mixin class */ {
public:
/**
* The character preceding a symbol reference name.
* @stable ICU 2.8
*/
enum { SYMBOL_REF = 0x0024 /*$*/ };
/**
* Destructor.
* @stable ICU 2.8
*/
virtual ~SymbolTable();
/**
* Lookup the characters associated with this string and return it.
* Return <tt>nullptr</tt> if no such name exists. The resultant
* string may have length zero.
* @param s the symbolic name to lookup
* @return a string containing the name's value, or <tt>nullptr</tt> if
* there is no mapping for s.
* @stable ICU 2.8
*/
virtual const UnicodeString* lookup(const UnicodeString& s) const = 0;
/**
* Lookup the UnicodeMatcher associated with the given character, and
* return it. Return <tt>nullptr</tt> if not found.
* @param ch a 32-bit code point from 0 to 0x10FFFF inclusive.
* @return the UnicodeMatcher object represented by the given
* character, or nullptr if there is no mapping for ch.
* @stable ICU 2.8
*/
virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const = 0;
/**
* Parse a symbol reference name from the given string, starting
* at the given position. If no valid symbol reference name is
* found, return the empty string and leave pos unchanged. That is, if the
* character at pos cannot start a name, or if pos is at or after
* text.length(), then return an empty string. This indicates an
* isolated SYMBOL_REF character.
* @param text the text to parse for the name
* @param pos on entry, the index of the first character to parse.
* This is the character following the SYMBOL_REF character. On
* exit, the index after the last parsed character. If the parse
* failed, pos is unchanged on exit.
* @param limit the index after the last character to be parsed.
* @return the parsed name, or an empty string if there is no
* valid symbolic name at the given position.
* @stable ICU 2.8
*/
virtual UnicodeString parseReference(const UnicodeString& text,
ParsePosition& pos, int32_t limit) const = 0;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

2211
thirdparty/icu4c/common/unicode/ubidi.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,326 @@
/*
******************************************************************************
*
* © 2016 and later: Unicode, Inc. and others.
* License & terms of use: http://www.unicode.org/copyright.html
*
******************************************************************************
* file name: ubiditransform.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2016jul24
* created by: Lina Kemmel
*
*/
#ifndef UBIDITRANSFORM_H
#define UBIDITRANSFORM_H
#include "unicode/utypes.h"
#include "unicode/ubidi.h"
#include "unicode/uchar.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C API: Bidi Transformations
*/
/**
* `UBiDiOrder` indicates the order of text.
*
* This bidi transformation engine supports all possible combinations (4 in
* total) of input and output text order:
*
* - <logical input, visual output>: unless the output direction is RTL, this
* corresponds to a normal operation of the Bidi algorithm as described in the
* Unicode Technical Report and implemented by `UBiDi` when the
* reordering mode is set to `UBIDI_REORDER_DEFAULT`. Visual RTL
* mode is not supported by `UBiDi` and is accomplished through
* reversing a visual LTR string,
*
* - <visual input, logical output>: unless the input direction is RTL, this
* corresponds to an "inverse bidi algorithm" in `UBiDi` with the
* reordering mode set to `UBIDI_REORDER_INVERSE_LIKE_DIRECT`.
* Visual RTL mode is not supported by `UBiDi` and is
* accomplished through reversing a visual LTR string,
*
* - <logical input, logical output>: if the input and output base directions
* mismatch, this corresponds to the `UBiDi` implementation with the
* reordering mode set to `UBIDI_REORDER_RUNS_ONLY`; and if the
* input and output base directions are identical, the transformation engine
* will only handle character mirroring and Arabic shaping operations without
* reordering,
*
* - <visual input, visual output>: this reordering mode is not supported by
* the `UBiDi` engine; it implies character mirroring, Arabic
* shaping, and - if the input/output base directions mismatch - string
* reverse operations.
* @see ubidi_setInverse
* @see ubidi_setReorderingMode
* @see UBIDI_REORDER_DEFAULT
* @see UBIDI_REORDER_INVERSE_LIKE_DIRECT
* @see UBIDI_REORDER_RUNS_ONLY
* @stable ICU 58
*/
typedef enum {
    /** 0: Constant indicating a logical order.
      * Input text uses this order by default.
      * @stable ICU 58
      */
    UBIDI_LOGICAL = 0,
    /** 1: Constant indicating a visual order.
      * This is a default for output text.
      * @stable ICU 58
      */
    UBIDI_VISUAL = 1
} UBiDiOrder;
/**
* <code>UBiDiMirroring</code> indicates whether or not characters with the
* "mirrored" property in RTL runs should be replaced with their mirror-image
* counterparts.
* @see UBIDI_DO_MIRRORING
* @see ubidi_setReorderingOptions
* @see ubidi_writeReordered
* @see ubidi_writeReverse
* @stable ICU 58
*/
typedef enum {
    /** 0: Constant indicating that character mirroring should not be
      * performed.
      * This is the default.
      * @stable ICU 58
      */
    UBIDI_MIRRORING_OFF = 0,
    /** 1: Constant indicating that character mirroring should be performed.
      * Equivalent to calling <code>ubidi_writeReordered</code> or
      * <code>ubidi_writeReverse</code> with the
      * <code>UBIDI_DO_MIRRORING</code> option bit set.
      * @stable ICU 58
      */
    UBIDI_MIRRORING_ON = 1
} UBiDiMirroring;
/**
* Forward declaration of the <code>UBiDiTransform</code> structure that stores
* information used by the layout transformation engine.
* @stable ICU 58
*/
typedef struct UBiDiTransform UBiDiTransform;
/**
* Performs transformation of text from the bidi layout defined by the input
* ordering scheme to the bidi layout defined by the output ordering scheme,
* and applies character mirroring and Arabic shaping operations.<p>
* In terms of <code>UBiDi</code>, such a transformation implies:
* <ul>
* <li>calling <code>ubidi_setReorderingMode</code> as needed (when the
* reordering mode is other than normal),</li>
* <li>calling <code>ubidi_setInverse</code> as needed (when text should be
* transformed from a visual to a logical form),</li>
* <li>resolving embedding levels of each character in the input text by
* calling <code>ubidi_setPara</code>,</li>
* <li>reordering the characters based on the computed embedding levels, also
* performing character mirroring as needed, and streaming the result to the
* output, by calling <code>ubidi_writeReordered</code>,</li>
* <li>performing Arabic digit and letter shaping on the output text by calling
* <code>u_shapeArabic</code>.</li>
* </ul>
* An "ordering scheme" encompasses the base direction and the order of text,
* and these characteristics must be defined by the caller for both input and
* output explicitly.<p>
* There are 36 possible combinations of <input, output> ordering schemes,
* which are partially supported by <code>UBiDi</code> already. Examples of the
* currently supported combinations:
* <ul>
* <li><Logical LTR, Visual LTR>: this is equivalent to calling
* <code>ubidi_setPara</code> with <code>paraLevel == UBIDI_LTR</code>,</li>
* <li><Logical RTL, Visual LTR>: this is equivalent to calling
* <code>ubidi_setPara</code> with <code>paraLevel == UBIDI_RTL</code>,</li>
* <li><Logical Default ("Auto") LTR, Visual LTR>: this is equivalent to
* calling <code>ubidi_setPara</code> with
* <code>paraLevel == UBIDI_DEFAULT_LTR</code>,</li>
* <li><Logical Default ("Auto") RTL, Visual LTR>: this is equivalent to
* calling <code>ubidi_setPara</code> with
* <code>paraLevel == UBIDI_DEFAULT_RTL</code>,</li>
* <li><Visual LTR, Logical LTR>: this is equivalent to
* calling <code>ubidi_setInverse(UBiDi*, true)</code> and then
* <code>ubidi_setPara</code> with <code>paraLevel == UBIDI_LTR</code>,</li>
* <li><Visual LTR, Logical RTL>: this is equivalent to
* calling <code>ubidi_setInverse(UBiDi*, true)</code> and then
* <code>ubidi_setPara</code> with <code>paraLevel == UBIDI_RTL</code>.</li>
* </ul>
* All combinations that involve the Visual RTL scheme are unsupported by
* <code>UBiDi</code>, for instance:
* <ul>
* <li><Logical LTR, Visual RTL>,</li>
* <li><Visual RTL, Logical RTL>.</li>
* </ul>
* <p>Example of usage of the transformation engine:<br>
* <pre>
* \code
* UChar text1[] = {'a', 'b', 'c', 0x0625, '1', 0};
* UChar text2[] = {'a', 'b', 'c', 0x0625, '1', 0};
* UErrorCode errorCode = U_ZERO_ERROR;
* // Run a transformation.
* ubiditransform_transform(pBidiTransform,
* text1, -1, text2, -1,
* UBIDI_LTR, UBIDI_VISUAL,
* UBIDI_RTL, UBIDI_LOGICAL,
* UBIDI_MIRRORING_OFF,
* U_SHAPE_DIGITS_AN2EN | U_SHAPE_DIGIT_TYPE_AN_EXTENDED,
* &errorCode);
* // Do something with text2.
* text2[4] = '2';
* // Run a reverse transformation.
* ubiditransform_transform(pBidiTransform,
* text2, -1, text1, -1,
* UBIDI_RTL, UBIDI_LOGICAL,
* UBIDI_LTR, UBIDI_VISUAL,
* UBIDI_MIRRORING_OFF,
* U_SHAPE_DIGITS_EN2AN | U_SHAPE_DIGIT_TYPE_AN_EXTENDED,
* &errorCode);
*\endcode
* </pre>
* </p>
*
* @param pBiDiTransform A pointer to a <code>UBiDiTransform</code> object
* allocated with <code>ubiditransform_open()</code> or
* <code>NULL</code>.<p>
* This object serves for one-time setup to amortize initialization
* overheads. Use of this object is not thread-safe. All other threads
* should allocate a new <code>UBiDiTransform</code> object by calling
* <code>ubiditransform_open()</code> before using it. Alternatively,
* a caller can set this parameter to <code>NULL</code>, in which case
* the object will be allocated by the engine on the fly.</p>
* @param src A pointer to the text that the Bidi layout transformations will
* be performed on.
* <p><strong>Note:</strong> the text must be (at least)
* <code>srcLength</code> long.</p>
* @param srcLength The length of the text, in number of UChars. If
* <code>srcLength == -1</code> then the text must be zero-terminated.
* @param dest A pointer to where the processed text is to be copied.
* @param destSize The size of the <code>dest</code> buffer, in number of
* UChars. If the <code>U_SHAPE_LETTERS_UNSHAPE</code> option is set,
* then the destination length could be as large as
* <code>srcLength * 2</code>. Otherwise, the destination length will
* not exceed <code>srcLength</code>. If the caller reserves the last
* position for zero-termination, it should be excluded from
* <code>destSize</code>.
* <p><code>destSize == -1</code> is allowed and makes sense when
* <code>dest</code> already holds some meaningful value, e.g. that of
* <code>src</code>. In this case <code>dest</code> must be
* zero-terminated.</p>
* @param inParaLevel A base embedding level of the input as defined in
* <code>ubidi_setPara</code> documentation for the
* <code>paraLevel</code> parameter.
* @param inOrder An order of the input, which can be one of the
* <code>UBiDiOrder</code> values.
* @param outParaLevel A base embedding level of the output as defined in
* <code>ubidi_setPara</code> documentation for the
* <code>paraLevel</code> parameter.
* @param outOrder An order of the output, which can be one of the
* <code>UBiDiOrder</code> values.
* @param doMirroring Indicates whether or not to perform character mirroring,
* and can accept one of the <code>UBiDiMirroring</code> values.
* @param shapingOptions Arabic digit and letter shaping options defined in the
* ushape.h documentation.
* <p><strong>Note:</strong> Direction indicator options are computed by
* the transformation engine based on the effective ordering schemes, so
* user-defined direction indicators will be ignored.</p>
* @param pErrorCode A pointer to an error code value.
*
* @return The destination length, i.e. the number of UChars written to
* <code>dest</code>. If the transformation fails, the return value
* will be 0 (and the error code will be written to
* <code>pErrorCode</code>).
*
* @see UBiDiLevel
* @see UBiDiOrder
* @see UBiDiMirroring
* @see ubidi_setPara
* @see u_shapeArabic
* @stable ICU 58
*/
U_CAPI uint32_t U_EXPORT2
ubiditransform_transform(UBiDiTransform *pBiDiTransform,
const UChar *src, int32_t srcLength,
UChar *dest, int32_t destSize,
UBiDiLevel inParaLevel, UBiDiOrder inOrder,
UBiDiLevel outParaLevel, UBiDiOrder outOrder,
UBiDiMirroring doMirroring, uint32_t shapingOptions,
UErrorCode *pErrorCode);
/**
* Allocates a <code>UBiDiTransform</code> object. This object can be reused,
* e.g. with different ordering schemes, mirroring or shaping options.<p>
* <strong>Note:</strong> The object can only be reused in the same thread.
* All other threads should allocate a new <code>UBiDiTransform</code> object
* before using it.<p>
* Example of usage:<p>
* <pre>
* \code
* UErrorCode errorCode = U_ZERO_ERROR;
* // Open a new UBiDiTransform.
* UBiDiTransform* transform = ubiditransform_open(&errorCode);
* // Run a transformation.
* ubiditransform_transform(transform,
* text1, -1, text2, -1,
* UBIDI_RTL, UBIDI_LOGICAL,
* UBIDI_LTR, UBIDI_VISUAL,
* UBIDI_MIRRORING_ON,
* U_SHAPE_DIGITS_EN2AN,
* &errorCode);
* // Do something with the output text and invoke another transformation using
* // that text as input.
* ubiditransform_transform(transform,
* text2, -1, text3, -1,
* UBIDI_LTR, UBIDI_VISUAL,
* UBIDI_RTL, UBIDI_VISUAL,
* UBIDI_MIRRORING_ON,
* 0, &errorCode);
*\endcode
* </pre>
* <p>
* The <code>UBiDiTransform</code> object must be deallocated by calling
* <code>ubiditransform_close()</code>.
*
* @return An empty <code>UBiDiTransform</code> object.
* @stable ICU 58
*/
U_CAPI UBiDiTransform* U_EXPORT2
ubiditransform_open(UErrorCode *pErrorCode);
/**
* Deallocates the given <code>UBiDiTransform</code> object.
* @stable ICU 58
*/
U_CAPI void U_EXPORT2
ubiditransform_close(UBiDiTransform *pBidiTransform);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUBiDiTransformPointer
* "Smart pointer" class, closes a UBiDiTransform via ubiditransform_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 58
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUBiDiTransformPointer, UBiDiTransform, ubiditransform_close);
U_NAMESPACE_END
#endif
#endif

647
thirdparty/icu4c/common/unicode/ubrk.h vendored Normal file
View File

@@ -0,0 +1,647 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 1996-2015, International Business Machines Corporation and others.
* All Rights Reserved.
******************************************************************************
*/
#ifndef UBRK_H
#define UBRK_H
#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/utext.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
* A text-break iterator.
* For usage in C programs.
*/
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
# define UBRK_TYPEDEF_UBREAK_ITERATOR
/**
* Opaque type representing an ICU Break iterator object.
* @stable ICU 2.0
*/
typedef struct UBreakIterator UBreakIterator;
#endif
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/parseerr.h"
/**
* \file
* \brief C API: BreakIterator
*
* <h2> BreakIterator C API </h2>
*
* The BreakIterator C API defines methods for finding the location
* of boundaries in text. A UBreakIterator maintains a
* current position and scans over text, returning the index of characters
* where boundaries occur.
* <p>
* Line boundary analysis determines where a text string can be broken
* when line-wrapping. The mechanism correctly handles punctuation and
* hyphenated words.
* <p>
* Note: The locale keyword "lb" can be used to modify line break
* behavior according to the CSS level 3 line-break options, see
* <http://dev.w3.org/csswg/css-text/#line-breaking>. For example:
* "ja@lb=strict", "zh@lb=loose".
* <p>
* Sentence boundary analysis allows selection with correct
* interpretation of periods within numbers and abbreviations, and
* trailing punctuation marks such as quotation marks and parentheses.
* <p>
* Note: The locale keyword "ss" can be used to enable use of
* segmentation suppression data (preventing breaks in English after
* abbreviations such as "Mr." or "Est.", for example), as follows:
* "en@ss=standard".
* <p>
* Word boundary analysis is used by search and replace functions, as
* well as within text editing applications that allow the user to
* select words with a double click. Word selection provides correct
* interpretation of punctuation marks within and following
* words. Characters that are not part of a word, such as symbols or
* punctuation marks, have word-breaks on both sides.
* <p>
* Character boundary analysis identifies the boundaries of
* "Extended Grapheme Clusters", which are groupings of codepoints
* that should be treated as character-like units for many text operations.
* Please see Unicode Standard Annex #29, Unicode Text Segmentation,
* http://www.unicode.org/reports/tr29/ for additional information
* on grapheme clusters and guidelines on their use.
* <p>
* Title boundary analysis locates all positions,
* typically starts of words, that should be set to Title Case
* when title casing the text.
* <p>
* The text boundary positions are found according to the rules
* described in Unicode Standard Annex #29, Text Boundaries, and
* Unicode Standard Annex #14, Line Breaking Properties. These
* are available at http://www.unicode.org/reports/tr14/ and
* http://www.unicode.org/reports/tr29/.
* <p>
* In addition to the plain C API defined in this header file, an
* object oriented C++ API with equivalent functionality is defined in the
* file brkiter.h.
* <p>
* Code snippets illustrating the use of the Break Iterator APIs
* are available in the ICU User Guide,
* https://unicode-org.github.io/icu/userguide/boundaryanalysis/
* and in the sample program icu/source/samples/break/break.cpp
*/
/**
 * The possible types of text boundaries.
 * The deprecated enumerators (UBRK_TITLE and UBRK_COUNT) are declared only
 * when U_HIDE_DEPRECATED_API is not defined.
 * @stable ICU 2.0
 */
typedef enum UBreakIteratorType {
/** Character breaks @stable ICU 2.0 */
UBRK_CHARACTER = 0,
/** Word breaks @stable ICU 2.0 */
UBRK_WORD = 1,
/** Line breaks @stable ICU 2.0 */
UBRK_LINE = 2,
/** Sentence breaks @stable ICU 2.0 */
UBRK_SENTENCE = 3,
#ifndef U_HIDE_DEPRECATED_API
/**
* Title Case breaks
* The iterator created using this type locates title boundaries as described for
* Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
* please use Word Boundary iterator.
*
* @deprecated ICU 2.8 Use the word break iterator for titlecasing for Unicode 4 and later.
*/
UBRK_TITLE = 4,
/**
* One more than the highest normal UBreakIteratorType value.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UBRK_COUNT = 5
#endif // U_HIDE_DEPRECATED_API
} UBreakIteratorType;
/**
 * Value indicating that all text boundaries have been returned.
 * @stable ICU 2.0
 */
#define UBRK_DONE ((int32_t)(-1))
/**
 * Word break tags, as returned by getRuleStatus().
 * Each category of word is given a range of tag values rather than a single
 * value, which leaves room for subdividing a category in future releases;
 * the upper limit of one category is the lower limit of the next.
 * Applications should test whether a tag falls within a range instead of
 * comparing against single individual values.
 *
 * The numeric values of all of these constants are stable (will not change).
 *
 * @stable ICU 2.2
 */
typedef enum UWordBreak {
    UBRK_WORD_NONE         = 0,   /**< Tag value for "words" that do not fit into any of the other
                                   *   categories. Includes spaces and most punctuation. */
    UBRK_WORD_NONE_LIMIT   = 100, /**< Upper bound for tags for uncategorized words. */
    UBRK_WORD_NUMBER       = 100, /**< Tag value for words that appear to be numbers, lower limit. */
    UBRK_WORD_NUMBER_LIMIT = 200, /**< Tag value for words that appear to be numbers, upper limit. */
    UBRK_WORD_LETTER       = 200, /**< Tag value for words that contain letters, excluding
                                   *   hiragana, katakana or ideographic characters, lower limit. */
    UBRK_WORD_LETTER_LIMIT = 300, /**< Tag value for words containing letters, upper limit. */
    UBRK_WORD_KANA         = 300, /**< Tag value for words containing kana characters, lower limit. */
    UBRK_WORD_KANA_LIMIT   = 400, /**< Tag value for words containing kana characters, upper limit. */
    UBRK_WORD_IDEO         = 400, /**< Tag value for words containing ideographic characters, lower limit. */
    UBRK_WORD_IDEO_LIMIT   = 500  /**< Tag value for words containing ideographic characters, upper limit. */
} UWordBreak;
/**
 * Line break tags, as returned by getRuleStatus().
 * Each category of line break is given a range of tag values rather than a
 * single value, which leaves room for subdividing a category in future releases.
 * Applications should test whether a tag falls within a range instead of
 * comparing against single individual values.
 *
 * The numeric values of all of these constants are stable (will not change).
 *
 * @stable ICU 2.8
 */
typedef enum ULineBreakTag {
    UBRK_LINE_SOFT       = 0,   /**< Tag value for soft line breaks, positions at which a line
                                 *   break is acceptable but not required. */
    UBRK_LINE_SOFT_LIMIT = 100, /**< Upper bound for soft line breaks. */
    UBRK_LINE_HARD       = 100, /**< Tag value for a hard, or mandatory line break. */
    UBRK_LINE_HARD_LIMIT = 200  /**< Upper bound for hard line breaks. */
} ULineBreakTag;
/**
 * Enum constants for the sentence break tags returned by getRuleStatus().
 * A range of values is defined for each category of
 * sentence, to allow for further subdivisions of a category in future releases.
 * Applications should check for tag values falling within the range, rather
 * than for single individual values.
 *
 * The numeric values of all of these constants are stable (will not change).
 *
 * @stable ICU 2.8
 */
typedef enum USentenceBreakTag {
    /** Tag value for sentences ending with a sentence terminator
     * ('.', '?', '!', etc.) character, possibly followed by a
     * hard separator (CR, LF, PS, etc.)
     */
    UBRK_SENTENCE_TERM = 0,
    /** Upper bound for tags for sentences ended by sentence terminators. */
    UBRK_SENTENCE_TERM_LIMIT = 100,
    /** Tag value for sentences that do not contain an ending
     * sentence terminator ('.', '?', '!', etc.) character, but
     * are ended only by a hard separator (CR, LF, PS, etc.) or end of input.
     */
    UBRK_SENTENCE_SEP = 100,
    /** Upper bound for tags for sentences ended by a separator. */
    UBRK_SENTENCE_SEP_LIMIT = 200
} USentenceBreakTag;
/**
* Open a new UBreakIterator for locating text boundaries for a specified locale.
* A UBreakIterator may be used for detecting character, line, word,
* and sentence breaks in text.
* @param type The type of UBreakIterator to open: one of UBRK_CHARACTER, UBRK_WORD,
* UBRK_LINE, UBRK_SENTENCE
* @param locale The locale specifying the text-breaking conventions. Note that
* locale keys such as "lb" and "ss" may be used to modify text break behavior,
* see general discussion of BreakIterator C API.
* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
* used to specify the text to be iterated.
* @param textLength The number of characters in text, or -1 if null-terminated.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified locale.
* @see ubrk_openRules
* @stable ICU 2.0
*/
U_CAPI UBreakIterator* U_EXPORT2
ubrk_open(UBreakIteratorType type,
const char *locale,
const UChar *text,
int32_t textLength,
UErrorCode *status);
/**
* Open a new UBreakIterator for locating text boundaries using specified breaking rules.
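* The rule syntax is described in the Boundary Analysis chapter of the ICU User Guide,
* https://unicode-org.github.io/icu/userguide/boundaryanalysis/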
* @param rules A set of rules specifying the text breaking conventions.
* @param rulesLength The number of characters in rules, or -1 if null-terminated.
* @param text The text to be iterated over. May be null, in which case ubrk_setText() is
* used to specify the text to be iterated.
* @param textLength The number of characters in text, or -1 if null-terminated.
* @param parseErr Receives position and context information for any syntax errors
* detected while parsing the rules.
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @stable ICU 2.2
*/
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openRules(const UChar *rules,
int32_t rulesLength,
const UChar *text,
int32_t textLength,
UParseError *parseErr,
UErrorCode *status);
/**
* Open a new UBreakIterator for locating text boundaries using precompiled binary rules.
* Opening a UBreakIterator this way is substantially faster than using ubrk_openRules.
* Binary rules may be obtained using ubrk_getBinaryRules. The compiled rules are not
* compatible across different major versions of ICU, nor across platforms of different
* endianness or different base character set family (ASCII vs EBCDIC).
* @param binaryRules A set of compiled binary rules specifying the text breaking
* conventions. Ownership of the storage containing the compiled
* rules remains with the caller of this function. The compiled
* rules must not be modified or deleted during the life of the
* break iterator.
* @param rulesLength The length of binaryRules in bytes; must be >= 0.
* @param text The text to be iterated over. May be null, in which case
* ubrk_setText() is used to specify the text to be iterated.
* @param textLength The number of characters in text, or -1 if null-terminated.
* @param status Pointer to UErrorCode to receive any errors.
* @return UBreakIterator for the specified rules.
* @see ubrk_getBinaryRules
* @stable ICU 59
*/
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
const UChar * text, int32_t textLength,
UErrorCode * status);
#ifndef U_HIDE_DEPRECATED_API
/**
* Thread safe cloning operation
* @param bi iterator to be cloned
* @param stackBuffer <em>Deprecated functionality as of ICU 52, use NULL.</em><br>
* user allocated space for the new clone. If NULL new memory will be allocated.
* If buffer is not large enough, new memory will be allocated.
* Clients can use the U_BRK_SAFECLONE_BUFFERSIZE.
* @param pBufferSize <em>Deprecated functionality as of ICU 52, use NULL or 1.</em><br>
* pointer to size of allocated space.
* If *pBufferSize == 0, a sufficient size for use in cloning will
* be returned ('pre-flighting')
* If *pBufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used
* if pBufferSize != NULL and any allocations were necessary
* @return pointer to the new clone
* @deprecated ICU 69 Use ubrk_clone() instead.
*/
U_DEPRECATED UBreakIterator * U_EXPORT2
ubrk_safeClone(
const UBreakIterator *bi,
void *stackBuffer,
int32_t *pBufferSize,
UErrorCode *status);
#endif /* U_HIDE_DEPRECATED_API */
/**
* Thread safe cloning operation.
* @param bi iterator to be cloned
* @param status to indicate whether the operation went on smoothly or there were errors
* @return pointer to the new clone
* @stable ICU 69
*/
U_CAPI UBreakIterator * U_EXPORT2
ubrk_clone(const UBreakIterator *bi,
UErrorCode *status);
#ifndef U_HIDE_DEPRECATED_API
/**
 * A recommended size (in bytes) for the memory buffer to be passed to ubrk_safeClone().
 * @deprecated ICU 52. Do not rely on ubrk_safeClone() cloning into any provided buffer.
 */
#define U_BRK_SAFECLONE_BUFFERSIZE 1
#endif /* U_HIDE_DEPRECATED_API */
/**
* Close a UBreakIterator.
* Once closed, a UBreakIterator may no longer be used.
* @param bi The break iterator to close.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ubrk_close(UBreakIterator *bi);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUBreakIteratorPointer
* "Smart pointer" class, closes a UBreakIterator via ubrk_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUBreakIteratorPointer, UBreakIterator, ubrk_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
* Sets an existing iterator to point to a new piece of text.
* The break iterator retains a pointer to the supplied text.
* The caller must not modify or delete the text while the BreakIterator
* retains the reference.
*
* @param bi The iterator to use
* @param text The text to be set
* @param textLength The length of the text
* @param status The error code
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ubrk_setText(UBreakIterator* bi,
const UChar* text,
int32_t textLength,
UErrorCode* status);
/**
* Sets an existing iterator to point to a new piece of text.
*
* All index positions returned by break iterator functions are
* native indices from the UText. For example, when breaking UTF-8
* encoded text, the break positions returned by \ref ubrk_next, \ref ubrk_previous, etc.
* will be UTF-8 string indices, not UTF-16 positions.
*
* @param bi The iterator to use
* @param text The text to be set.
* This function makes a shallow clone of the supplied UText. This means
* that the caller is free to immediately close or otherwise reuse the
* UText that was passed as a parameter, but that the underlying text itself
* must not be altered while being referenced by the break iterator.
* @param status The error code
* @stable ICU 3.4
*/
U_CAPI void U_EXPORT2
ubrk_setUText(UBreakIterator* bi,
UText* text,
UErrorCode* status);
/**
* Determine the most recently-returned text boundary.
*
* @param bi The break iterator to use.
* @return The character index most recently returned by \ref ubrk_next, \ref ubrk_previous,
* \ref ubrk_first, or \ref ubrk_last.
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_current(const UBreakIterator *bi);
/**
* Advance the iterator to the boundary following the current boundary.
*
* @param bi The break iterator to use.
* @return The character index of the next text boundary, or UBRK_DONE
* if all text boundaries have been returned.
* @see ubrk_previous
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_next(UBreakIterator *bi);
/**
* Set the iterator position to the boundary preceding the current boundary.
*
* @param bi The break iterator to use.
* @return The character index of the preceding text boundary, or UBRK_DONE
* if all text boundaries have been returned.
* @see ubrk_next
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_previous(UBreakIterator *bi);
/**
* Set the iterator position to zero, the start of the text being scanned.
* @param bi The break iterator to use.
* @return The new iterator position (zero).
* @see ubrk_last
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_first(UBreakIterator *bi);
/**
* Set the iterator position to the index immediately <EM>beyond</EM> the last character in the text being scanned.
* This is not the same as the last character.
* @param bi The break iterator to use.
* @return The character offset immediately <EM>beyond</EM> the last character in the
* text being scanned.
* @see ubrk_first
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_last(UBreakIterator *bi);
/**
* Set the iterator position to the first boundary preceding the specified offset.
* The new position is always smaller than offset, or UBRK_DONE.
* @param bi The break iterator to use.
* @param offset The offset to begin scanning.
* @return The text boundary preceding offset, or UBRK_DONE.
* @see ubrk_following
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_preceding(UBreakIterator *bi,
int32_t offset);
/**
* Advance the iterator to the first boundary following the specified offset.
* The value returned is always greater than offset, or UBRK_DONE.
* @param bi The break iterator to use.
* @param offset The offset to begin scanning.
* @return The text boundary following offset, or UBRK_DONE.
* @see ubrk_preceding
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_following(UBreakIterator *bi,
int32_t offset);
/**
* Get a locale for which text breaking information is available.
* A UBreakIterator in a locale returned by this function will perform the correct
* text breaking for the locale.
* @param index The index of the desired locale.
* @return A locale for which text breaking information is available, or 0 if none.
* @see ubrk_countAvailable
* @stable ICU 2.0
*/
U_CAPI const char* U_EXPORT2
ubrk_getAvailable(int32_t index);
/**
* Determine how many locales have text breaking information available.
* This function is most useful as determining the loop ending condition for
* calls to \ref ubrk_getAvailable.
* @return The number of locales for which text breaking information is available.
* @see ubrk_getAvailable
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_countAvailable(void);
/**
* Returns true if the specified position is a boundary position. As a side
* effect, leaves the iterator pointing to the first boundary position at
* or after "offset".
* @param bi The break iterator to use.
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable ICU 2.0
*/
U_CAPI UBool U_EXPORT2
ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
/**
* Return the status from the break rule that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. For rules that do not specify a
* status, a default value of 0 is returned.
* <p>
* For word break iterators, the possible values are defined in enum UWordBreak.
* For line and sentence break iterators, see enum ULineBreakTag and
* enum USentenceBreakTag respectively.
* @param bi The break iterator to use.
* @return The status value from the break rule that determined the most recently
*         returned break position, or 0 if that rule does not specify a status.
* @see ubrk_getRuleStatusVec
* @stable ICU 2.2
*/
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi);
/**
* Get the statuses from the break rules that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. The default status value for rules
* that do not explicitly provide one is zero.
* <p>
* For word break iterators, the possible values are defined in enum UWordBreak.
* @param bi The break iterator to use
* @param fillInVec an array to be filled in with the status values.
* @param capacity the length of the supplied vector. A length of zero causes
* the function to return the number of status values, in the
* normal way, without attempting to store any values.
* @param status receives error codes.
* @return The number of rule status values from rules that determined
* the most recent boundary returned by the break iterator.
* @stable ICU 3.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
/**
* Return the locale of the break iterator. You can choose between the valid and
* the actual locale.
* @param bi break iterator
* @param type locale type (valid or actual)
* @param status error code
* @return locale string
* @stable ICU 2.8
*/
U_CAPI const char* U_EXPORT2
ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
/**
* Set the subject text string upon which the break iterator is operating
* without changing any other aspect of the state.
* The new and previous text strings must have the same content.
*
* This function is intended for use in environments where ICU is operating on
* strings that may move around in memory. It provides a mechanism for notifying
* ICU that the string has been relocated, and providing a new UText to access the
* string in its new position.
*
* Note that the break iterator never copies the underlying text
* of a string being processed, but always operates directly on the original text
* provided by the user. Refreshing simply drops the references to the old text
* and replaces them with references to the new.
*
* Caution: this function is normally used only by very specialized
* system-level code. One example use case is with garbage collection
* that moves the text in memory.
*
* @param bi The break iterator.
* @param text The new (moved) text string.
* @param status Receives errors detected by this function.
*
* @stable ICU 49
*/
U_CAPI void U_EXPORT2
ubrk_refreshUText(UBreakIterator *bi,
UText *text,
UErrorCode *status);
/**
* Get a compiled binary version of the rules specifying the behavior of a UBreakIterator.
* The binary rules may be used with ubrk_openBinaryRules to open a new UBreakIterator
* more quickly than using ubrk_openRules. The compiled rules are not compatible across
* different major versions of ICU, nor across platforms of different endianness or
* different base character set family (ASCII vs EBCDIC). Supports preflighting (with
* binaryRules=NULL and rulesCapacity=0) to get the rules length without copying them to
* the binaryRules buffer. However, whether preflighting or not, if the actual length
* is greater than INT32_MAX, then the function returns 0 and sets *status to
* U_INDEX_OUTOFBOUNDS_ERROR.
* @param bi The break iterator to use.
* @param binaryRules Buffer to receive the compiled binary rules; set to NULL for
* preflighting.
* @param rulesCapacity Capacity (in bytes) of the binaryRules buffer; set to 0 for
* preflighting. Must be >= 0.
* @param status Pointer to UErrorCode to receive any errors, such as
* U_BUFFER_OVERFLOW_ERROR, U_INDEX_OUTOFBOUNDS_ERROR, or
* U_ILLEGAL_ARGUMENT_ERROR.
* @return The actual byte length of the binary rules, if <= INT32_MAX;
* otherwise 0. If not preflighting and this is larger than
* rulesCapacity, *status will be set to an error.
* @see ubrk_openBinaryRules
* @stable ICU 59
*/
U_CAPI int32_t U_EXPORT2
ubrk_getBinaryRules(UBreakIterator *bi,
uint8_t * binaryRules, int32_t rulesCapacity,
UErrorCode * status);
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif

View File

@@ -0,0 +1,388 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2005-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: ucasemap.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2005may06
* created by: Markus W. Scherer
*
* Case mapping service object and functions using it.
*/
#ifndef __UCASEMAP_H__
#define __UCASEMAP_H__
#include "unicode/utypes.h"
#include "unicode/stringoptions.h"
#include "unicode/ustring.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C API: Unicode case mapping functions using a UCaseMap service object.
*
* The service object takes care of memory allocations, data loading, and setup
* for the attributes, as usual.
*
* Currently, the functionality provided here does not overlap with uchar.h
* and ustring.h, except for ucasemap_toTitle().
*
* ucasemap_utf8XYZ() functions operate directly on UTF-8 strings.
*/
/**
 * Opaque service object used by the newer ICU case mapping functions.
 * Older functions did not use a service object.
 * @stable ICU 3.4
 */
struct UCaseMap;
/**
 * C typedef for struct UCaseMap.
 * @stable ICU 3.4
 */
typedef struct UCaseMap UCaseMap;
/**
* Open a UCaseMap service object for a locale and a set of options.
* The locale ID and options are preprocessed so that functions using the
* service object need not process them in each call.
*
* @param locale ICU locale ID, used for language-dependent
* upper-/lower-/title-casing according to the Unicode standard.
* Usual semantics: ""=root, NULL=default locale, etc.
* @param options Options bit set, used for case folding and string comparisons.
* Same flags as for u_foldCase(), u_strFoldCase(),
* u_strCaseCompare(), etc.
* Use 0 or U_FOLD_CASE_DEFAULT for default behavior.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return Pointer to a UCaseMap service object, if successful.
*
* @see U_FOLD_CASE_DEFAULT
* @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @see U_TITLECASE_NO_LOWERCASE
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
* @stable ICU 3.4
*/
U_CAPI UCaseMap * U_EXPORT2
ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);
/**
* Close a UCaseMap service object.
* @param csm Object to be closed.
* @stable ICU 3.4
*/
U_CAPI void U_EXPORT2
ucasemap_close(UCaseMap *csm);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUCaseMapPointer
* "Smart pointer" class, closes a UCaseMap via ucasemap_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
* Get the locale ID that is used for language-dependent case mappings.
* @param csm UCaseMap service object.
* @return locale ID
* @stable ICU 3.4
*/
U_CAPI const char * U_EXPORT2
ucasemap_getLocale(const UCaseMap *csm);
/**
* Get the options bit set that is used for case folding and string comparisons.
* @param csm UCaseMap service object.
* @return options bit set
* @stable ICU 3.4
*/
U_CAPI uint32_t U_EXPORT2
ucasemap_getOptions(const UCaseMap *csm);
/**
* Set the locale ID that is used for language-dependent case mappings.
*
* @param csm UCaseMap service object.
* @param locale Locale ID, see ucasemap_open().
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @see ucasemap_open
* @stable ICU 3.4
*/
U_CAPI void U_EXPORT2
ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode);
/**
* Set the options bit set that is used for case folding and string comparisons.
*
* @param csm UCaseMap service object.
* @param options Options bit set, see ucasemap_open().
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @see ucasemap_open
* @stable ICU 3.4
*/
U_CAPI void U_EXPORT2
ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
#if !UCONFIG_NO_BREAK_ITERATION
/**
* Get the break iterator that is used for titlecasing.
* Do not modify the returned break iterator.
* @param csm UCaseMap service object.
* @return titlecasing break iterator
* @stable ICU 3.8
*/
U_CAPI const UBreakIterator * U_EXPORT2
ucasemap_getBreakIterator(const UCaseMap *csm);
/**
* Set the break iterator that is used for titlecasing.
* The UCaseMap service object releases a previously set break iterator
* and "adopts" this new one, taking ownership of it.
* It will be released in a subsequent call to ucasemap_setBreakIterator()
* or ucasemap_close().
*
* Break iterator operations are not thread-safe. Therefore, titlecasing
* functions use non-const UCaseMap objects. It is not possible to titlecase
* strings concurrently using the same UCaseMap.
*
* @param csm UCaseMap service object.
* @param iterToAdopt Break iterator to be adopted for titlecasing.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @see ucasemap_toTitle
* @see ucasemap_utf8ToTitle
* @stable ICU 3.8
*/
U_CAPI void U_EXPORT2
ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode);
/**
* Titlecase a UTF-16 string. This function is almost a duplicate of u_strToTitle(),
* except that it takes ucasemap_setOptions() into account and has performance
* advantages from being able to use a UCaseMap object for multiple case mapping
* operations, saving setup time.
*
* Casing is locale-dependent and context-sensitive.
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with ucasemap_setOptions().)
*
* Note: This function takes a non-const UCaseMap pointer because it will
* open a default break iterator if no break iterator was set yet,
* and effectively call ucasemap_setBreakIterator();
* also because the break iterator is stateful and will be modified during
* the iteration.
*
* The titlecase break iterator can be provided to customize for arbitrary
* styles, using rules and dictionaries beyond the standard iterators.
* If no break iterator has been set with ucasemap_setBreakIterator(), the default
* Unicode algorithm will be used to determine the titlecase positions.
*
* This function uses only the setText(), first() and next() methods of the
* provided break iterator.
*
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object. This pointer is non-const!
* See the note above for details.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents are undefined in case of failure.
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strToTitle
* @stable ICU 3.8
*/
U_CAPI int32_t U_EXPORT2
ucasemap_toTitle(UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UErrorCode *pErrorCode);
#endif // UCONFIG_NO_BREAK_ITERATION
/**
* Lowercase the characters in a UTF-8 string.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents are undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strToLower
* @stable ICU 3.4
*/
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToLower(const UCaseMap *csm,
char *dest, int32_t destCapacity,
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
/**
* Uppercase the characters in a UTF-8 string.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents are undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strToUpper
* @stable ICU 3.4
*/
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToUpper(const UCaseMap *csm,
char *dest, int32_t destCapacity,
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
#if !UCONFIG_NO_BREAK_ITERATION
/**
* Titlecase a UTF-8 string.
* Casing is locale-dependent and context-sensitive.
* Titlecasing uses a break iterator to find the first characters of words
* that are to be titlecased. It titlecases those characters and lowercases
* all others. (This can be modified with ucasemap_setOptions().)
*
* Note: This function takes a non-const UCaseMap pointer because it will
* open a default break iterator if no break iterator was set yet,
* and effectively call ucasemap_setBreakIterator();
* also because the break iterator is stateful and will be modified during
* the iteration.
*
* The titlecase break iterator can be provided to customize for arbitrary
* styles, using rules and dictionaries beyond the standard iterators.
* If no break iterator has been set with ucasemap_setBreakIterator(), the default
* Unicode algorithm will be used to determine the titlecase positions.
*
* This function uses only the setUText(), first(), next() and close() methods of the
* provided break iterator.
*
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object. This pointer is non-const!
* See the note above for details.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents are undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strToTitle
* @see U_TITLECASE_NO_LOWERCASE
* @see U_TITLECASE_NO_BREAK_ADJUSTMENT
* @stable ICU 3.8
*/
U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToTitle(UCaseMap *csm,
char *dest, int32_t destCapacity,
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
#endif
/**
* Case-folds the characters in a UTF-8 string.
*
* Case-folding is locale-independent and not context-sensitive,
* but there is an option for whether to include or exclude mappings for dotted I
* and dotless i that are marked with 'T' in CaseFolding.txt.
*
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, both on success and in case of a buffer overflow;
* in the overflow case it will be greater than destCapacity.
*
* @see u_strFoldCase
* @see ucasemap_setOptions
* @see U_FOLD_CASE_DEFAULT
* @see U_FOLD_CASE_EXCLUDE_SPECIAL_I
* @stable ICU 3.8
*/
U_CAPI int32_t U_EXPORT2
ucasemap_utf8FoldCase(const UCaseMap *csm,
char *dest, int32_t destCapacity,
const char *src, int32_t srcLength,
UErrorCode *pErrorCode);
#endif

160
thirdparty/icu4c/common/unicode/ucat.h vendored Normal file
View File

@@ -0,0 +1,160 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2003-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: March 19 2003
* Since: ICU 2.6
**********************************************************************
*/
#ifndef UCAT_H
#define UCAT_H
#include "unicode/utypes.h"
#include "unicode/ures.h"
/**
* \file
* \brief C API: Message Catalog Wrappers
*
* This C API provides look-alike functions that deliberately resemble
* the POSIX catopen, catclose, and catgets functions. The underlying
* implementation is in terms of ICU resource bundles, rather than
* POSIX message catalogs.
*
* The ICU resource bundles obey standard ICU inheritance policies.
* To facilitate this, sets and messages are flattened into one tier.
* This is done by creating resource bundle keys of the form
* &lt;set_num&gt;%&lt;msg_num&gt; where set_num is the set number and msg_num is
* the message number, formatted as decimal strings.
*
* Example: Consider a message catalog containing two sets:
*
* Set 1: Message 4 = "Good morning."
* Message 5 = "Good afternoon."
* Message 7 = "Good evening."
* Message 8 = "Good night."
* Set 4: Message 14 = "Please "
* Message 19 = "Thank you."
* Message 20 = "Sincerely,"
*
* The ICU resource bundle source file, assuming it is named
* "greet.txt", would look like this:
*
* greet
* {
* 1%4 { "Good morning." }
* 1%5 { "Good afternoon." }
* 1%7 { "Good evening." }
* 1%8 { "Good night." }
*
* 4%14 { "Please " }
* 4%19 { "Thank you." }
* 4%20 { "Sincerely," }
* }
*
* The catgets function is commonly used in combination with functions
* like printf and strftime. ICU components like message format can
* be used instead, although they use a different format syntax.
* There is an ICU package, icuio, that provides some of
* the POSIX-style formatting API.
*/
U_CDECL_BEGIN
/**
* An ICU message catalog descriptor, analogous to nl_catd.
*
* @stable ICU 2.6
*/
typedef UResourceBundle* u_nl_catd;
/**
* Open and return an ICU message catalog descriptor. The descriptor
* may be passed to u_catgets() to retrieve localized strings.
*
* @param name string containing the full path pointing to the
* directory where the resources reside followed by the package name
* e.g. "/usr/resource/my_app/resources/guimessages" on a Unix system.
* If NULL, ICU default data files will be used.
*
* Unlike POSIX, environment variables are not interpolated within the
* name.
*
* @param locale the locale for which we want to open the resource. If
* NULL, the default ICU locale will be used (see uloc_getDefault). If
* strlen(locale) == 0, the root locale will be used.
*
* @param ec input/output error code. Upon output,
* U_USING_FALLBACK_WARNING indicates that a fallback locale was
* used. For example, 'de_CH' was requested, but nothing was found
* there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that the
* default locale data or root locale data was used; neither the
* requested locale nor any of its fallback locales were found.
*
* @return a message catalog descriptor that may be passed to
* u_catgets(). If the ec parameter indicates success, then the caller
* is responsible for calling u_catclose() to close the message
* catalog. If the ec parameter indicates failure, then NULL will be
* returned.
*
* @stable ICU 2.6
*/
U_CAPI u_nl_catd U_EXPORT2
u_catopen(const char* name, const char* locale, UErrorCode* ec);
/**
* Close an ICU message catalog, given its descriptor.
*
* @param catd a message catalog descriptor to be closed. May be NULL,
* in which case no action is taken.
*
* @stable ICU 2.6
*/
U_CAPI void U_EXPORT2
u_catclose(u_nl_catd catd);
/**
* Retrieve a localized string from an ICU message catalog.
*
* @param catd a message catalog descriptor returned by u_catopen.
*
* @param set_num the message catalog set number. Sets need not be
* numbered consecutively.
*
* @param msg_num the message catalog message number within the
* set. Messages need not be numbered consecutively.
*
* @param s the default string. This is returned if the string
* specified by the set_num and msg_num is not found. It must be
* zero-terminated.
*
* @param len fill-in parameter to receive the length of the result.
* May be NULL, in which case it is ignored.
*
* @param ec input/output error code. May be U_USING_FALLBACK_WARNING
* or U_USING_DEFAULT_WARNING. U_MISSING_RESOURCE_ERROR indicates that
* the set_num/msg_num tuple does not specify a valid message string
* in this catalog.
*
* @return a pointer to a zero-terminated UChar array which lives in
* an internal buffer area, typically a memory mapped/DLL file. The
* caller must NOT delete this pointer. If the call is unsuccessful
* for any reason, then s is returned. This includes the situation in
* which ec indicates a failing error code upon entry to this
* function.
*
* @stable ICU 2.6
*/
U_CAPI const UChar* U_EXPORT2
u_catgets(u_nl_catd catd, int32_t set_num, int32_t msg_num,
const UChar* s,
int32_t* len, UErrorCode* ec);
U_CDECL_END
#endif /*UCAT_H*/
/*eof*/

4404
thirdparty/icu4c/common/unicode/uchar.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,623 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucharstrie.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov14
* created by: Markus W. Scherer
*/
#ifndef __UCHARSTRIE_H__
#define __UCHARSTRIE_H__
/**
* \file
* \brief C++ API: Trie for mapping Unicode strings (or 16-bit-unit sequences)
* to integer values.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/unistr.h"
#include "unicode/uobject.h"
#include "unicode/ustringtrie.h"
U_NAMESPACE_BEGIN
class Appendable;
class UCharsTrieBuilder;
class UVector32;
/**
* Light-weight, non-const reader class for a UCharsTrie.
* Traverses a char16_t-serialized data structure with minimal state,
* for mapping strings (16-bit-unit sequences) to non-negative integer values.
*
* This class owns the serialized trie data only if it was constructed by
* the builder's build() method.
* The public constructor and the copy constructor only alias the data (only copy the pointer).
* There is no assignment operator.
*
* This class is not intended for public subclassing.
* @stable ICU 4.8
*/
class U_COMMON_API UCharsTrie : public UMemory {
public:
/**
* Constructs a UCharsTrie reader instance.
*
* The trieUChars must contain a copy of a char16_t sequence from the UCharsTrieBuilder,
* starting with the first char16_t of that sequence.
* The UCharsTrie object will not read more char16_ts than
* the UCharsTrieBuilder generated in the corresponding build() call.
*
* The array is not copied/cloned and must not be modified while
* the UCharsTrie object is in use.
*
* @param trieUChars The char16_t array that contains the serialized trie.
* @stable ICU 4.8
*/
UCharsTrie(ConstChar16Ptr trieUChars)
        : ownedArray_(nullptr),       // aliasing constructor: no array is adopted
          uchars_(trieUChars),
          pos_(uchars_),              // begin reading at the first unit of the trie data
          remainingMatchLength_(-1)   // negative: not inside a linear-match node
{}
/**
* Destructor.
* @stable ICU 4.8
*/
~UCharsTrie();
/**
* Copy constructor, copies the other trie reader object and its state,
* but not the char16_t array which will be shared. (Shallow copy.)
* @param other Another UCharsTrie object.
* @stable ICU 4.8
*/
UCharsTrie(const UCharsTrie &other)
        : ownedArray_(nullptr),       // the copy only shares other's array and adopts nothing
          uchars_(other.uchars_),
          pos_(other.pos_),           // carry over the traversal state as well
          remainingMatchLength_(other.remainingMatchLength_)
{}
/**
* Resets this trie to its initial state.
* @return *this
* @stable ICU 4.8
*/
UCharsTrie &reset() {
    // Same state that the constructors establish:
    // no pending linear-match node, and reading from the start of the trie data.
    remainingMatchLength_ = -1;
    pos_ = uchars_;
    return *this;
}
/**
* Returns the state of this trie as a 64-bit integer.
* The state value is never 0.
*
* @return opaque state value
* @see resetToState64
* @stable ICU 65
*/
uint64_t getState64() const {
    // High bits: remainingMatchLength_ biased by 2, shifted above the position field.
    const uint64_t remainingBits =
        static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift;
    // Low bits: offset of the current position from the start of the trie array.
    const uint64_t offsetBits = static_cast<uint64_t>(pos_ - uchars_);
    return remainingBits | offsetBits;
}
/**
* Resets this trie to the saved state.
* Unlike resetToState(State), the 64-bit state value
* must be from getState64() from the same trie object or
* from one initialized the exact same way.
* Because of no validation, this method is faster.
*
* @param state The opaque trie state value from getState64().
* @return *this
* @see getState64
* @see resetToState
* @see reset
* @stable ICU 65
*/
UCharsTrie &resetToState64(uint64_t state) {
    // Split the opaque value back into the two fields packed by getState64().
    const uint64_t biasedRemaining = state >> kState64RemainingShift;
    const uint64_t offset = state & kState64PosMask;
    // Undo the +2 bias; no validation is performed on either field.
    remainingMatchLength_ = static_cast<int32_t>(biasedRemaining) - 2;
    pos_ = uchars_ + offset;
    return *this;
}
/**
* UCharsTrie state object, for saving a trie's current state
* and resetting the trie back to this state later.
* @stable ICU 4.8
*/
class State : public UMemory {
public:
    /**
     * Constructs an empty State.
     * All fields are initialized so that an empty State can be copied safely;
     * uchars==nullptr is what marks the State as empty.
     * @stable ICU 4.8
     */
    State() : uchars(nullptr), pos(nullptr), remainingMatchLength(-1) {}
private:
    friend class UCharsTrie;

    // Trie array the state was saved from (UCharsTrie::uchars_); nullptr while empty.
    // UCharsTrie::resetToState() compares this against its own array before restoring.
    const char16_t *uchars;
    // Saved UCharsTrie::pos_.
    const char16_t *pos;
    // Saved UCharsTrie::remainingMatchLength_.
    int32_t remainingMatchLength;
};
/**
* Saves the state of this trie.
* @param state The State object to hold the trie's state.
* @return *this
* @see resetToState
* @stable ICU 4.8
*/
const UCharsTrie &saveState(State &state) const {
    // Snapshot the traversal position ...
    state.remainingMatchLength = remainingMatchLength_;
    state.pos = pos_;
    // ... and tag it with the trie array so that resetToState() can tell
    // whether the snapshot belongs to this trie.
    state.uchars = uchars_;
    return *this;
}
/**
* Resets this trie to the saved state.
* If the state object contains no state, or the state of a different trie,
* then this trie remains unchanged.
* @param state The State object which holds a saved trie state.
* @return *this
* @see saveState
* @see reset
* @stable ICU 4.8
*/
UCharsTrie &resetToState(const State &state) {
    // Leave this trie untouched when it has no array, or when the snapshot
    // was taken from a different array (which includes an empty State).
    if(uchars_ == nullptr || uchars_ != state.uchars) {
        return *this;
    }
    remainingMatchLength_ = state.remainingMatchLength;
    pos_ = state.pos;
    return *this;
}
/**
* Determines whether the string so far matches, whether it has a value,
* and whether another input char16_t can continue a matching string.
* @return The match/value Result.
* @stable ICU 4.8
*/
UStringTrieResult current() const;
/**
* Traverses the trie from the initial state for this input char16_t.
* Equivalent to reset().next(uchar).
* @param uchar Input char value. Values below 0 and above 0xffff will never match.
* @return The match/value Result.
* @stable ICU 4.8
*/
inline UStringTrieResult first(int32_t uchar) {
// Drop any pending linear-match state: nextImpl() is declared as requiring
// a negative remaining match length.
remainingMatchLength_=-1;
// Start matching at the beginning of the trie data rather than at pos_.
return nextImpl(uchars_, uchar);
}
/**
* Traverses the trie from the initial state for the
* one or two UTF-16 code units for this input code point.
* Equivalent to reset().nextForCodePoint(cp).
* @param cp A Unicode code point 0..0x10ffff.
* @return The match/value Result.
* @stable ICU 4.8
*/
UStringTrieResult firstForCodePoint(UChar32 cp);
/**
* Traverses the trie from the current state for this input char16_t.
* @param uchar Input char value. Values below 0 and above 0xffff will never match.
* @return The match/value Result.
* @stable ICU 4.8
*/
UStringTrieResult next(int32_t uchar);
/**
* Traverses the trie from the current state for the
* one or two UTF-16 code units for this input code point.
* @param cp A Unicode code point 0..0x10ffff.
* @return The match/value Result.
* @stable ICU 4.8
*/
UStringTrieResult nextForCodePoint(UChar32 cp);
/**
* Traverses the trie from the current state for this string.
* Equivalent to
* \code
* Result result=current();
* for(each c in s)
* if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
* result=next(c);
* return result;
* \endcode
* @param s A string. Can be nullptr if length is 0.
* @param length The length of the string. Can be -1 if NUL-terminated.
* @return The match/value Result.
* @stable ICU 4.8
*/
UStringTrieResult next(ConstChar16Ptr s, int32_t length);
/**
* Returns a matching string's value if called immediately after
* current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
* getValue() can be called multiple times.
*
* Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
* @return The value for the string so far.
* @stable ICU 4.8
*/
inline int32_t getValue() const {
const char16_t *pos=pos_;
int32_t leadUnit=*pos++;
// U_ASSERT(leadUnit>=kMinValueLead);
return leadUnit&kValueIsFinal ?
readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
}
/**
* Determines whether all strings reachable from the current state
* map to the same value.
* @param uniqueValue Receives the unique value, if this function returns true.
* (output-only)
* @return true if all strings reachable from the current state
* map to the same value.
* @stable ICU 4.8
*/
inline UBool hasUniqueValue(int32_t &uniqueValue) const {
    if(pos_ == nullptr) {
        // No position to search from; uniqueValue is left untouched.
        return false;
    }
    // Skip the rest of a pending linear-match node.
    const char16_t *nodeStart = pos_ + remainingMatchLength_ + 1;
    return findUniqueValue(nodeStart, false, uniqueValue) != 0;
}
/**
* Finds each char16_t which continues the string from the current state.
* That is, each char16_t c for which it would be next(c)!=USTRINGTRIE_NO_MATCH now.
* @param out Each next char16_t is appended to this object.
* @return the number of char16_ts which continue the string from here
* @stable ICU 4.8
*/
int32_t getNextUChars(Appendable &out) const;
/**
* Iterator for all of the (string, value) pairs in a UCharsTrie.
* @stable ICU 4.8
*/
class U_COMMON_API Iterator : public UMemory {
public:
/**
* Iterates from the root of a char16_t-serialized UCharsTrie.
* @param trieUChars The trie char16_ts.
* @param maxStringLength If 0, the iterator returns full strings.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.8
*/
Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Iterates from the current state of the specified UCharsTrie.
* @param trie The trie whose state will be copied for iteration.
* @param maxStringLength If 0, the iterator returns full strings.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @stable ICU 4.8
*/
Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Destructor.
* @stable ICU 4.8
*/
~Iterator();
/**
* Resets this iterator to its initial state.
* @return *this
* @stable ICU 4.8
*/
Iterator &reset();
/**
* @return true if there are more elements.
* @stable ICU 4.8
*/
UBool hasNext() const;
/**
* Finds the next (string, value) pair if there is one.
*
* If the string is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return true if there is another element.
* @stable ICU 4.8
*/
UBool next(UErrorCode &errorCode);
/**
* @return The string for the last successful next().
* @stable ICU 4.8
*/
const UnicodeString &getString() const {
    // Returned by reference: no copy of the iterator's string member is made.
    return str_;
}
/**
* @return The value for the last successful next().
* @stable ICU 4.8
*/
int32_t getValue() const {
    // value_ is -1 after truncateAndStop(), i.e. when there was no real value.
    return value_;
}
private:
UBool truncateAndStop() {
    value_ = -1;     // no real value for str
    pos_ = nullptr;  // clear the current read position
    return true;
}
const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
// Base pointer that the offsets stored on stack_ are relative to (see the stack comment below).
const char16_t *uchars_;
// Current read position; truncateAndStop() sets it to nullptr.
const char16_t *pos_;
// NOTE(review): initialPos_/initialRemainingMatchLength_ presumably hold the starting
// state that reset() restores — confirm against the Iterator implementation.
const char16_t *initialPos_;
int32_t remainingMatchLength_;
int32_t initialRemainingMatchLength_;
UBool skipValue_; // Skip intermediate value which was already delivered.
// String returned by getString().
UnicodeString str_;
// NOTE(review): presumably the maxStringLength constructor argument — confirm.
int32_t maxLength_;
// Value returned by getValue(); truncateAndStop() sets it to -1.
int32_t value_;
// The stack stores pairs of integers for backtracking to another
// outbound edge of a branch node.
// The first integer is an offset from uchars_.
// The second integer has the str_.length() from before the node in bits 15..0,
// and the remaining branch length in bits 31..16.
// (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
// but the code looks more confusing that way.)
UVector32 *stack_;
};
private:
friend class UCharsTrieBuilder;
/**
* Constructs a UCharsTrie reader instance.
* Unlike the public constructor which just aliases an array,
* this constructor adopts the builder's array.
* This constructor is only called by the builder.
*/
UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
        : ownedArray_(adoptUChars),   // the builder's array is adopted, not aliased
          uchars_(trieUChars),
          pos_(uchars_),              // begin reading at the first unit of the trie data
          remainingMatchLength_(-1)   // negative: not inside a linear-match node
{}
// No assignment operator.
UCharsTrie &operator=(const UCharsTrie &other) = delete;
inline void stop() {
    // A nullptr position means that there are no more matches.
    pos_ = nullptr;
}
// Reads a compact 32-bit integer.
// pos is already after the leadUnit, and the lead unit has bit 15 reset.
static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
    if(leadUnit < kMinTwoUnitValueLead) {
        // One unit: the lead unit is the value.
        return leadUnit;
    }
    if(leadUnit < kThreeUnitValueLead) {
        // Two units: high bits come from the lead unit, low 16 bits from the next unit.
        return ((leadUnit - kMinTwoUnitValueLead) << 16) | *pos;
    }
    // Three units: the two units after the lead unit hold the whole 32-bit value.
    return (pos[0] << 16) | pos[1];
}
static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
    if(leadUnit < kMinTwoUnitValueLead) {
        // One-unit value: nothing follows the lead unit.
        return pos;
    }
    // Two-unit values have one trailing unit, three-unit values have two.
    return leadUnit < kThreeUnitValueLead ? pos + 1 : pos + 2;
}
static inline const char16_t *skipValue(const char16_t *pos) {
    // Read the lead unit here, then let the two-argument overload skip
    // the trailing units; bit 15 is masked off first.
    const int32_t leadUnit = *pos;
    return skipValue(pos + 1, leadUnit & 0x7fff);
}
static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
    // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
    if(leadUnit < kMinTwoUnitNodeValueLead) {
        // One unit: the bits above the low 6 bits hold the value plus 1.
        return (leadUnit >> 6) - 1;
    }
    if(leadUnit < kThreeUnitNodeValueLead) {
        // Two units: high bits from the lead unit, low 16 bits from the next unit.
        return (((leadUnit & 0x7fc0) - kMinTwoUnitNodeValueLead) << 10) | *pos;
    }
    // Three units: the two units after the lead unit hold the whole 32-bit value.
    return (pos[0] << 16) | pos[1];
}
static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
    // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
    if(leadUnit < kMinTwoUnitNodeValueLead) {
        // One-unit node value: nothing follows the lead unit.
        return pos;
    }
    // Two-unit node values have one trailing unit, three-unit ones have two.
    return leadUnit < kThreeUnitNodeValueLead ? pos + 1 : pos + 2;
}
static inline const char16_t *jumpByDelta(const char16_t *pos) {
    const int32_t lead = *pos++;
    if(lead < kMinTwoUnitDeltaLead) {
        // One unit: the lead unit is the delta, applied after the lead unit.
        return pos + lead;
    }
    if(lead == kThreeUnitDeltaLead) {
        // Three units: the delta is stored in the two units after the lead unit.
        const int32_t delta = (pos[0] << 16) | pos[1];
        return pos + 2 + delta;
    }
    // Two units: high bits from the lead unit, low 16 bits from the next unit.
    const int32_t delta = ((lead - kMinTwoUnitDeltaLead) << 16) | pos[0];
    return pos + 1 + delta;
}
static const char16_t *skipDelta(const char16_t *pos) {
    const int32_t lead = *pos++;
    if(lead < kMinTwoUnitDeltaLead) {
        // One-unit delta: only the lead unit had to be skipped.
        return pos;
    }
    // A three-unit delta has two trailing units, a two-unit delta has one.
    return lead == kThreeUnitDeltaLead ? pos + 2 : pos + 1;
}
static inline UStringTrieResult valueResult(int32_t node) {
    // node>>15 isolates the kValueIsFinal bit (0x8000) of a 16-bit lead unit.
    // NOTE(review): subtracting it presumably turns USTRINGTRIE_INTERMEDIATE_VALUE into
    // USTRINGTRIE_FINAL_VALUE — confirm against the enum order in ustringtrie.h.
    const int32_t finalBit = node >> 15;
    return static_cast<UStringTrieResult>(USTRINGTRIE_INTERMEDIATE_VALUE - finalBit);
}
// Handles a branch node for both next(uchar) and next(string).
UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
// Requires remainingMatchLength_<0.
UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
// Helper functions for hasUniqueValue().
// Recursively finds a unique value (or whether there is not a unique one)
// from a branch.
static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
UBool haveUniqueValue, int32_t &uniqueValue);
// Recursively finds a unique value (or whether there is not a unique one)
// starting from a position on a node lead unit.
static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
// Helper functions for getNextUChars().
// getNextUChars() when pos is on a branch node.
static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
// UCharsTrie data structure
//
// The trie consists of a series of char16_t-serialized nodes for incremental
// Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
// The root node is at the beginning of the trie data.
//
// Types of nodes are distinguished by their node lead unit ranges.
// After each node, except a final-value node, another node follows to
// encode match values or continue matching further units.
//
// Node types:
// - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
// The value is for the string/char16_t sequence so far.
// - Match node, optionally with an intermediate value in a different compact format.
// The value, if present, is for the string/char16_t sequence so far.
//
// Aside from the value, which uses the node lead unit's high bits:
//
// - Linear-match node: Matches a number of units.
// - Branch node: Branches to other nodes according to the current input unit.
// The node unit is the length of the branch (number of units to select from)
// minus 1. It is followed by a sub-node:
// - If the length is at most kMaxBranchLinearSubNodeLength, then
// there are length-1 (key, value) pairs and then one more comparison unit.
// If one of the key units matches, then the value is either a final value for
// the string so far, or a "jump" delta to the next node.
// If the last unit matches, then matching continues with the next node.
// (Values have the same encoding as final-value nodes.)
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
// there is one unit and one "jump" delta.
// If the input unit is less than the sub-node unit, then "jump" by delta to
// the next sub-node which will have a length of length/2.
// (The delta has its own compact encoding.)
// Otherwise, skip the "jump" delta to the next sub-node
// which will have a length of length-length/2.
// Match-node lead unit values, after masking off intermediate-value bits:
// 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
// the length is one more than the next unit.
// For a branch sub-node with at most this many entries, we drop down
// to a linear search.
static const int32_t kMaxBranchLinearSubNodeLength=5;
// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
static const int32_t kMinLinearMatch=0x30;
// 0x10 == 16, the upper end of the 1..16 range above.
static const int32_t kMaxLinearMatchLength=0x10;
// Match-node lead unit bits 14..6 for the optional intermediate value.
// If these bits are 0, then there is no intermediate value.
// Otherwise, see the *NodeValue* constants below.
static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
// A final-value node has bit 15 set.
static const int32_t kValueIsFinal=0x8000;
// Compact value: After testing and masking off bit 15, use the following thresholds.
// These thresholds are what readValue() and skipValue() compare the lead unit against.
static const int32_t kMaxOneUnitValue=0x3fff;
static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
static const int32_t kThreeUnitValueLead=0x7fff;
static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
// Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
// These thresholds are what readNodeValue() and skipNodeValue() compare the lead unit against.
static const int32_t kMaxOneUnitNodeValue=0xff;
static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
static const int32_t kThreeUnitNodeValueLead=0x7fc0;
static const int32_t kMaxTwoUnitNodeValue=
((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
// Compact delta integers.
// These thresholds are what jumpByDelta() and skipDelta() compare the lead unit against.
static const int32_t kMaxOneUnitDelta=0xfbff;
static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
static const int32_t kThreeUnitDeltaLead=0xffff;
static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
// For getState64():
// The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
// so we need at least 5 bits for that.
// We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
// Bit position at which getState64() stores remainingMatchLength_+2;
// the bits below it hold the position offset pos_-uchars_.
static constexpr int32_t kState64RemainingShift = 59;
// Mask selecting the low kState64RemainingShift bits, i.e. the position offset.
static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
// Array adopted through the private, builder-only constructor;
// nullptr when constructed via the public or the copy constructor (aliasing only).
char16_t *ownedArray_;
// Fixed value referencing the UCharsTrie words.
const char16_t *uchars_;
// Iterator variables.
// Pointer to next trie unit to read. nullptr if no more matches.
const char16_t *pos_;
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
int32_t remainingMatchLength_;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __UCHARSTRIE_H__

View File

@@ -0,0 +1,193 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucharstriebuilder.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov14
* created by: Markus W. Scherer
*/
#ifndef __UCHARSTRIEBUILDER_H__
#define __UCHARSTRIEBUILDER_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/stringtriebuilder.h"
#include "unicode/ucharstrie.h"
#include "unicode/unistr.h"
/**
* \file
* \brief C++ API: Builder for icu::UCharsTrie
*/
U_NAMESPACE_BEGIN
class UCharsTrieElement;
/**
* Builder class for UCharsTrie.
*
* This class is not intended for public subclassing.
* @stable ICU 4.8
*/
class U_COMMON_API UCharsTrieBuilder : public StringTrieBuilder {
public:
/**
* Constructs an empty builder.
* @param errorCode Standard ICU error code.
* @stable ICU 4.8
*/
UCharsTrieBuilder(UErrorCode &errorCode);
/**
* Destructor.
* @stable ICU 4.8
*/
virtual ~UCharsTrieBuilder();
/**
* Adds a (string, value) pair.
* The string must be unique.
* The string contents will be copied; the builder does not keep
* a reference to the input UnicodeString or its buffer.
* @param s The input string.
* @param value The value associated with this string.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return *this
* @stable ICU 4.8
*/
UCharsTrieBuilder &add(const UnicodeString &s, int32_t value, UErrorCode &errorCode);
/**
* Builds a UCharsTrie for the add()ed data.
* Once built, no further data can be add()ed until clear() is called.
*
* A UCharsTrie cannot be empty. At least one (string, value) pair
* must have been add()ed.
*
* This method passes ownership of the builder's internal result array to the new trie object.
* Another call to any build() variant will re-serialize the trie.
* After clear() has been called, a new array will be used as well.
* @param buildOption Build option, see UStringTrieBuildOption.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return A new UCharsTrie for the add()ed data.
* @stable ICU 4.8
*/
UCharsTrie *build(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
/**
* Builds a UCharsTrie for the add()ed data and char16_t-serializes it.
* Once built, no further data can be add()ed until clear() is called.
*
* A UCharsTrie cannot be empty. At least one (string, value) pair
* must have been add()ed.
*
* Multiple calls to buildUnicodeString() set the UnicodeStrings to the
* builder's same char16_t array, without rebuilding.
* If buildUnicodeString() is called after build(), the trie will be
* re-serialized into a new array (because build() passes on ownership).
* If build() is called after buildUnicodeString(), the trie object returned
* by build() will become the owner of the underlying data for the
* previously returned UnicodeString.
* After clear() has been called, a new array will be used as well.
* @param buildOption Build option, see UStringTrieBuildOption.
* @param result A UnicodeString which will be set to the char16_t-serialized
* UCharsTrie for the add()ed data.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return result
* @stable ICU 4.8
*/
UnicodeString &buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
UErrorCode &errorCode);
/**
* Removes all (string, value) pairs.
* New data can then be add()ed and a new trie can be built.
* @return *this
* @stable ICU 4.8
*/
UCharsTrieBuilder &clear() {
    // Only the lengths are reset here; the elements and uchars buffers
    // themselves are not touched by this function.
    ucharsLength = 0;
    elementsLength = 0;
    // Drop the accumulated string storage.
    strings.remove();
    return *this;
}
private:
UCharsTrieBuilder(const UCharsTrieBuilder &other) = delete; // no copy constructor
UCharsTrieBuilder &operator=(const UCharsTrieBuilder &other) = delete; // no assignment operator
void buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
virtual int32_t getElementStringLength(int32_t i) const override;
virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const override;
virtual int32_t getElementValue(int32_t i) const override;
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const override;
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const override;
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const override;
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const override;
// The overrides below hand fixed, UCharsTrie-specific answers to the StringTrieBuilder base class.
// Always true for this builder.
virtual UBool matchNodesCanHaveValues() const override { return true; }
// The following three forward UCharsTrie's private format constants.
virtual int32_t getMaxBranchLinearSubNodeLength() const override { return UCharsTrie::kMaxBranchLinearSubNodeLength; }
virtual int32_t getMinLinearMatch() const override { return UCharsTrie::kMinLinearMatch; }
virtual int32_t getMaxLinearMatchLength() const override { return UCharsTrie::kMaxLinearMatchLength; }
// LinearMatchNode subclass used by this builder; it overrides operator== and write()
// and adds a pointer to char16_t units.
class UCTLinearMatchNode : public LinearMatchNode {
public:
// NOTE(review): units/len presumably describe the run of char16_t units matched by this
// node, and nextNode the node that follows it -- confirm in the implementation.
UCTLinearMatchNode(const char16_t *units, int32_t len, Node *nextNode);
virtual bool operator==(const Node &other) const override;
virtual void write(StringTrieBuilder &builder) override;
private:
// Pointer to const char16_t units.
// NOTE(review): presumably the `units` pointer given to the constructor -- confirm.
const char16_t *s;
};
// Override that creates a linear-match node.
// NOTE(review): presumably returns a UCTLinearMatchNode -- confirm in the implementation.
virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
Node *nextNode) const override;
// NOTE(review): presumably makes sure the uchars buffer can hold `length` units, with the
// UBool result signalling success -- confirm in the implementation.
UBool ensureCapacity(int32_t length);
// Output primitives, each returning an int32_t. All but write(const char16_t *, int32_t)
// are marked `override`; that overload is declared without `virtual`/`override`.
virtual int32_t write(int32_t unit) override;
int32_t write(const char16_t *s, int32_t length);
virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) override;
virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) override;
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) override;
virtual int32_t writeDeltaTo(int32_t jumpTarget) override;
// String storage; emptied by clear() via strings.remove().
UnicodeString strings;
// Element array, with (going by the names) its allocated capacity and used length.
// clear() resets elementsLength to 0 and leaves elements/elementsCapacity untouched.
UCharsTrieElement *elements;
int32_t elementsCapacity;
int32_t elementsLength;
// char16_t serialization of the trie.
// Grows from the back: ucharsLength measures from the end of the buffer!
// clear() resets ucharsLength to 0 and leaves uchars/ucharsCapacity untouched.
char16_t *uchars;
int32_t ucharsCapacity;
int32_t ucharsLength;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __UCHARSTRIEBUILDER_H__

View File

@@ -0,0 +1,393 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1998-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef UCHRITER_H
#define UCHRITER_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/chariter.h"
/**
* \file
* \brief C++ API: char16_t Character Iterator
*/
U_NAMESPACE_BEGIN
/**
* A concrete subclass of CharacterIterator that iterates over the
* characters (code units or code points) in a char16_t array.
* It's possible not only to create an
* iterator that iterates over an entire char16_t array, but also to
* create one that iterates over only a subrange of a char16_t array
* (iterators over different subranges of the same char16_t array don't
* compare equal).
* @see CharacterIterator
* @see ForwardCharacterIterator
* @stable ICU 2.0
*/
class U_COMMON_API UCharCharacterIterator : public CharacterIterator {
public:
/**
* Create an iterator over the char16_t array referred to by "textPtr".
* The iteration range is 0 to <code>length-1</code>.
* text is only aliased, not adopted (the
* destructor will not delete it).
* @param textPtr The char16_t array to be iterated over
* @param length The length of the char16_t array
* @stable ICU 2.0
*/
UCharCharacterIterator(ConstChar16Ptr textPtr, int32_t length);
/**
* Create an iterator over the char16_t array referred to by "textPtr".
* The iteration range is 0 to <code>length-1</code>.
* text is only aliased, not adopted (the
* destructor will not delete it).
* The starting
* position is specified by "position". If "position" is outside the valid
* iteration range, the behavior of this object is undefined.
* @param textPtr The char16_t array to be iterated over
* @param length The length of the char16_t array
* @param position The starting position of the iteration
* @stable ICU 2.0
*/
UCharCharacterIterator(ConstChar16Ptr textPtr, int32_t length,
int32_t position);
/**
* Create an iterator over the char16_t array referred to by "textPtr".
* The iteration range is <code>textBegin</code> to <code>textEnd-1</code>.
* text is only aliased, not adopted (the
* destructor will not delete it).
* The starting
* position is specified by "position". If textBegin and textEnd do not
* form a valid iteration range or "position" is outside the valid
* iteration range, the behavior of this object is undefined.
* @param textPtr The char16_t array to be iterated over
* @param length The length of the char16_t array
* @param textBegin The begin position of the iteration range
* @param textEnd The end position of the iteration range
* @param position The starting position of the iteration
* @stable ICU 2.0
*/
UCharCharacterIterator(ConstChar16Ptr textPtr, int32_t length,
int32_t textBegin,
int32_t textEnd,
int32_t position);
/**
* Copy constructor. The new iterator iterates over the same range
* of the same string as "that", and its initial position is the
* same as "that"'s current position.
* @param that The UCharCharacterIterator to be copied
* @stable ICU 2.0
*/
UCharCharacterIterator(const UCharCharacterIterator& that);
/**
* Destructor.
* @stable ICU 2.0
*/
virtual ~UCharCharacterIterator();
/**
* Assignment operator. *this is altered to iterate over the same
* range of the same string as "that", and refers to the same
* character within that string as "that" does.
* @param that The object to be copied
* @return the newly created object
* @stable ICU 2.0
*/
UCharCharacterIterator&
operator=(const UCharCharacterIterator& that);
/**
* Returns true if the iterators iterate over the same range of the
* same string and are pointing at the same character.
* @param that The ForwardCharacterIterator used to be compared for equality
* @return true if the iterators iterate over the same range of the
* same string and are pointing at the same character.
* @stable ICU 2.0
*/
virtual bool operator==(const ForwardCharacterIterator& that) const override;
/**
* Generates a hash code for this iterator.
* @return the hash code.
* @stable ICU 2.0
*/
virtual int32_t hashCode() const override;
/**
* Returns a new UCharCharacterIterator referring to the same
* character in the same range of the same string as this one. The
* caller must delete the new iterator.
* @return the CharacterIterator newly created
* @stable ICU 2.0
*/
virtual UCharCharacterIterator* clone() const override;
/**
* Sets the iterator to refer to the first code unit in its
* iteration range, and returns that code unit.
* This can be used to begin an iteration with next().
* @return the first code unit in its iteration range.
* @stable ICU 2.0
*/
virtual char16_t first() override;
/**
* Sets the iterator to refer to the first code unit in its
* iteration range, returns that code unit, and moves the position
* to the second code unit. This is an alternative to setToStart()
* for forward iteration with nextPostInc().
* @return the first code unit in its iteration range
* @stable ICU 2.0
*/
virtual char16_t firstPostInc() override;
/**
* Sets the iterator to refer to the first code point in its
* iteration range, and returns that code point.
* This can be used to begin an iteration with next32().
* Note that an iteration with next32PostInc(), beginning with,
* e.g., setToStart() or firstPostInc(), is more efficient.
* @return the first code point in its iteration range
* @stable ICU 2.0
*/
virtual UChar32 first32() override;
/**
* Sets the iterator to refer to the first code point in its
* iteration range, returns that code point, and moves the position
* to the second code point. This is an alternative to setToStart()
* for forward iteration with next32PostInc().
* @return the first code point in its iteration range.
* @stable ICU 2.0
*/
virtual UChar32 first32PostInc() override;
/**
* Sets the iterator to refer to the last code unit in its
* iteration range, and returns that code unit.
* This can be used to begin an iteration with previous().
* @return the last code unit in its iteration range.
* @stable ICU 2.0
*/
virtual char16_t last() override;
/**
* Sets the iterator to refer to the last code point in its
* iteration range, and returns that code point.
* This can be used to begin an iteration with previous32().
* @return the last code point in its iteration range.
* @stable ICU 2.0
*/
virtual UChar32 last32() override;
/**
* Sets the iterator to refer to the "position"-th code unit
* in the text-storage object the iterator refers to, and
* returns that code unit.
* @param position the position within the text-storage object
* @return the code unit
* @stable ICU 2.0
*/
virtual char16_t setIndex(int32_t position) override;
/**
* Sets the iterator to refer to the beginning of the code point
* that contains the "position"-th code unit
* in the text-storage object the iterator refers to, and
* returns that code point.
* The current position is adjusted to the beginning of the code point
* (its first code unit).
* @param position the position within the text-storage object
* @return the code point
* @stable ICU 2.0
*/
virtual UChar32 setIndex32(int32_t position) override;
/**
* Returns the code unit the iterator currently refers to.
* @return the code unit the iterator currently refers to.
* @stable ICU 2.0
*/
virtual char16_t current() const override;
/**
* Returns the code point the iterator currently refers to.
* @return the code point the iterator currently refers to.
* @stable ICU 2.0
*/
virtual UChar32 current32() const override;
/**
* Advances to the next code unit in the iteration range (toward
* endIndex()), and returns that code unit. If there are no more
* code units to return, returns DONE.
* @return the next code unit in the iteration range.
* @stable ICU 2.0
*/
virtual char16_t next() override;
/**
* Gets the current code unit for returning and advances to the next code unit
* in the iteration range
* (toward endIndex()). If there are
* no more code units to return, returns DONE.
* @return the current code unit.
* @stable ICU 2.0
*/
virtual char16_t nextPostInc() override;
/**
* Advances to the next code point in the iteration range (toward
* endIndex()), and returns that code point. If there are no more
* code points to return, returns DONE.
* Note that iteration with "pre-increment" semantics is less
* efficient than iteration with "post-increment" semantics
* that is provided by next32PostInc().
* @return the next code point in the iteration range.
* @stable ICU 2.0
*/
virtual UChar32 next32() override;
/**
* Gets the current code point for returning and advances to the next code point
* in the iteration range
* (toward endIndex()). If there are
* no more code points to return, returns DONE.
* @return the current code point.
* @stable ICU 2.0
*/
virtual UChar32 next32PostInc() override;
/**
* Returns false if there are no more code units or code points
* at or after the current position in the iteration range.
* This is used with nextPostInc() or next32PostInc() in forward
* iteration.
* @return false if there are no more code units or code points
* at or after the current position in the iteration range.
* @stable ICU 2.0
*/
virtual UBool hasNext() override;
/**
* Advances to the previous code unit in the iteration range (toward
* startIndex()), and returns that code unit. If there are no more
* code units to return, returns DONE.
* @return the previous code unit in the iteration range.
* @stable ICU 2.0
*/
virtual char16_t previous() override;
/**
* Advances to the previous code point in the iteration range (toward
* startIndex()), and returns that code point. If there are no more
* code points to return, returns DONE.
* @return the previous code point in the iteration range.
* @stable ICU 2.0
*/
virtual UChar32 previous32() override;
/**
* Returns false if there are no more code units or code points
* before the current position in the iteration range.
* This is used with previous() or previous32() in backward
* iteration.
* @return false if there are no more code units or code points
* before the current position in the iteration range.
* @stable ICU 2.0
*/
virtual UBool hasPrevious() override;
/**
* Moves the current position relative to the start or end of the
* iteration range, or relative to the current position itself.
* The movement is expressed in numbers of code units forward
* or backward by specifying a positive or negative delta.
* @param delta the position relative to origin. A positive delta means forward;
* a negative delta means backward.
* @param origin Origin enumeration {kStart, kCurrent, kEnd}
* @return the new position
* @stable ICU 2.0
*/
virtual int32_t move(int32_t delta, EOrigin origin) override;
/**
* Moves the current position relative to the start or end of the
* iteration range, or relative to the current position itself.
* The movement is expressed in numbers of code points forward
* or backward by specifying a positive or negative delta.
* @param delta the position relative to origin. A positive delta means forward;
* a negative delta means backward.
* @param origin Origin enumeration {kStart, kCurrent, kEnd}
* @return the new position
* @stable ICU 2.0
*/
#ifdef move32
// One of the system headers right now is sometimes defining a conflicting macro we don't use
#undef move32
#endif
virtual int32_t move32(int32_t delta, EOrigin origin) override;
/**
* Sets the iterator to iterate over a new range of text
* @param newText The new char16_t text to be iterated over
* @param newTextLength The length of newText
* @stable ICU 2.0
*/
void setText(ConstChar16Ptr newText, int32_t newTextLength);
/**
* Copies the char16_t array under iteration into the UnicodeString
* referred to by "result". Even if this iterator iterates across
* only a part of this string, the whole string is copied.
* @param result Receives a copy of the text under iteration.
* @stable ICU 2.0
*/
virtual void getText(UnicodeString& result) override;
/**
* Return a class ID for this class (not really public)
* @return a class ID for this class
* @stable ICU 2.0
*/
static UClassID U_EXPORT2 getStaticClassID();
/**
* Return a class ID for this object (not really public)
* @return a class ID for this object.
* @stable ICU 2.0
*/
virtual UClassID getDynamicClassID() const override;
protected:
/**
* Protected constructor
* @stable ICU 2.0
*/
UCharCharacterIterator();
/**
* Protected member text: pointer to char16_t text.
* Per the public constructors' documentation, the text is only aliased,
* not adopted (the destructor will not delete it).
* @stable ICU 2.0
*/
const char16_t* text;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

262
thirdparty/icu4c/common/unicode/uclean.h vendored Normal file
View File

@@ -0,0 +1,262 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 2001-2014, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* file name: uclean.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001July05
* created by: George Rhoten
*/
#ifndef __UCLEAN_H__
#define __UCLEAN_H__
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Initialize and clean up ICU
*/
/**
* Initialize ICU.
*
* Use of this function is optional. It is OK to simply use ICU
* services and functions without first having initialized
* ICU by calling u_init().
*
* u_init() will attempt to load some part of ICU's data, and is
* useful as a test for configuration or installation problems that
* leave the ICU data inaccessible. A successful invocation of u_init()
* does not, however, guarantee that all ICU data is accessible.
*
* Multiple calls to u_init() cause no harm, aside from the small amount
* of time required.
*
* In old versions of ICU, u_init() was required in multi-threaded applications
* to ensure the thread safety of ICU. u_init() is no longer needed for this purpose.
*
* @param status An ICU UErrorCode parameter. It must not be <code>NULL</code>.
* An Error will be returned if some required part of ICU data can not
* be loaded or initialized.
* The function returns immediately if the input error code indicates a
* failure, as usual.
*
* @stable ICU 2.6
*/
U_CAPI void U_EXPORT2
u_init(UErrorCode *status);
#ifndef U_HIDE_SYSTEM_API
/**
* Clean up the system resources, such as allocated memory or open files,
* used in all ICU libraries. This will free/delete all memory owned by the
* ICU libraries, and return them to their original load state. All open ICU
* items (collators, resource bundles, converters, etc.) must be closed before
* calling this function, otherwise ICU may not free its allocated memory
* (e.g. close your converters and resource bundles before calling this
* function). Generally, this function should be called once just before
* an application exits. For applications that dynamically load and unload
* the ICU libraries (relatively uncommon), u_cleanup() should be called
* just before the library unload.
* <p>
* u_cleanup() also clears any ICU heap functions, mutex functions or
* trace functions that may have been set for the process.
* This has the effect of restoring ICU to its initial condition, before
* any of these override functions were installed. Refer to
* u_setMemoryFunctions(), u_setMutexFunctions() and
* utrace_setFunctions(). If ICU is to be reinitialized after
* calling u_cleanup(), these runtime override functions will need to
* be set up again if they are still required.
* <p>
* u_cleanup() is not thread safe. All other threads should stop using ICU
* before calling this function.
* <p>
* Any open ICU items will be left in an undefined state by u_cleanup(),
* and any subsequent attempt to use such an item will give unpredictable
* results.
* <p>
* After calling u_cleanup(), an application may continue to use ICU by
* calling u_init(). An application must invoke u_init() first from one single
* thread before allowing other threads to call u_init(). All threads existing
* at the time of the first thread's call to u_init() must also call
* u_init() themselves before continuing with other ICU operations.
* <p>
* The use of u_cleanup() just before an application terminates is optional,
* but it should be called only once for performance reasons. The primary
* benefit is to eliminate reports of memory or resource leaks originating
* in ICU code from the results generated by heap analysis tools.
* <p>
* <strong>Use this function with great care!</strong>
* </p>
*
* @stable ICU 2.0
* @system
*/
U_CAPI void U_EXPORT2
u_cleanup(void);
U_CDECL_BEGIN
/**
* Pointer type for a user supplied memory allocation function.
* @param context user supplied value, obtained from u_setMemoryFunctions().
* @param size The number of bytes to be allocated
* @return Pointer to the newly allocated memory, or NULL if the allocation failed.
* @stable ICU 2.8
* @system
*/
typedef void *U_CALLCONV UMemAllocFn(const void *context, size_t size);
/**
* Pointer type for a user supplied memory re-allocation function.
* @param context user supplied value, obtained from u_setMemoryFunctions().
* @param mem Pointer to the memory block to be resized.
* @param size The new size for the block.
* @return Pointer to the resized memory block, or NULL if the resizing failed.
* @stable ICU 2.8
* @system
*/
typedef void *U_CALLCONV UMemReallocFn(const void *context, void *mem, size_t size);
/**
* Pointer type for a user supplied memory free function. Behavior should be
* similar to the standard C library free(). The function does not return a value.
* @param context user supplied value, obtained from u_setMemoryFunctions().
* @param mem Pointer to the memory block to be freed.
* @stable ICU 2.8
* @system
*/
typedef void U_CALLCONV UMemFreeFn (const void *context, void *mem);
/**
* Set the functions that ICU will use for memory allocation.
* Use of this function is optional; by default (without this function), ICU will
* use the standard C library malloc() and free() functions.
* This function can only be used when ICU is in an initial, unused state, before
* u_init() has been called.
* @param context This pointer value will be saved, and then (later) passed as
* a parameter to the memory functions each time they
* are called.
* @param a Pointer to a user-supplied malloc function.
* @param r Pointer to a user-supplied realloc function.
* @param f Pointer to a user-supplied free function.
* @param status Receives error values.
* @stable ICU 2.8
* @system
*/
U_CAPI void U_EXPORT2
u_setMemoryFunctions(const void *context, UMemAllocFn * U_CALLCONV_FPTR a, UMemReallocFn * U_CALLCONV_FPTR r, UMemFreeFn * U_CALLCONV_FPTR f,
UErrorCode *status);
U_CDECL_END
#ifndef U_HIDE_DEPRECATED_API
/*********************************************************************************
*
* Deprecated Functions
*
* The following functions for user supplied mutexes are no longer supported.
* Any attempt to use them will return a U_UNSUPPORTED_ERROR.
*
**********************************************************************************/
/**
* An opaque pointer type that represents an ICU mutex.
* For user-implemented mutexes, the value will typically point to a
* struct or object that implements the mutex.
* @deprecated ICU 52. This type is no longer supported.
* @system
*/
typedef void *UMTX;
U_CDECL_BEGIN
/**
* Function Pointer type for a user supplied mutex initialization function.
* The user-supplied function will be called by ICU whenever ICU needs to create a
* new mutex. The function implementation should create a mutex, and store a pointer
* to something that uniquely identifies the mutex into the UMTX that is supplied
* as a parameter.
* @param context user supplied value, obtained from u_setMutexFunctions().
* @param mutex Receives a pointer that identifies the new mutex.
* The mutex init function must set the UMTX to a non-null value.
* Subsequent calls by ICU to lock, unlock, or destroy a mutex will
* identify the mutex by the UMTX value.
* @param status Error status. Report errors back to ICU by setting this variable
* with an error code.
* @deprecated ICU 52. This function is no longer supported.
* @system
*/
typedef void U_CALLCONV UMtxInitFn (const void *context, UMTX *mutex, UErrorCode* status);
/**
* Function pointer type for user-supplied mutex functions.
* One of the user-supplied functions with this signature will be called by ICU
* whenever ICU needs to lock, unlock, or destroy a mutex.
* @param context user supplied value, obtained from u_setMutexFunctions().
* @param mutex specify the mutex on which to operate.
* @deprecated ICU 52. This function is no longer supported.
* @system
*/
typedef void U_CALLCONV UMtxFn (const void *context, UMTX *mutex);
U_CDECL_END
/**
* Set the functions that ICU will use for mutex operations
* Use of this function is optional; by default (without this function), ICU will
* directly access system functions for mutex operations
* This function can only be used when ICU is in an initial, unused state, before
* u_init() has been called.
* @param context This pointer value will be saved, and then (later) passed as
* a parameter to the user-supplied mutex functions each time they
* are called.
* @param init Pointer to a mutex initialization function. Must be non-null.
* @param destroy Pointer to the mutex destroy function. Must be non-null.
* @param lock pointer to the mutex lock function. Must be non-null.
* @param unlock Pointer to the mutex unlock function. Must be non-null.
* @param status Receives error values.
* @deprecated ICU 52. This function is no longer supported.
* @system
*/
U_DEPRECATED void U_EXPORT2
u_setMutexFunctions(const void *context, UMtxInitFn *init, UMtxFn *destroy, UMtxFn *lock, UMtxFn *unlock,
UErrorCode *status);
/**
* Pointer type for a user supplied atomic increment or decrement function.
* @param context user supplied value, obtained from u_setAtomicIncDecFunctions().
* @param p Pointer to a 32 bit int to be incremented or decremented
* @return The value of the variable after the inc or dec operation.
* @deprecated ICU 52. This function is no longer supported.
* @system
*/
typedef int32_t U_CALLCONV UMtxAtomicFn(const void *context, int32_t *p);
/**
* Set the functions that ICU will use for atomic increment and decrement of int32_t values.
* Use of this function is optional; by default (without this function), ICU will
* use its own internal implementation of atomic increment/decrement.
* This function can only be used when ICU is in an initial, unused state, before
* u_init() has been called.
* @param context This pointer value will be saved, and then (later) passed as
* a parameter to the increment and decrement functions each time they
* are called.
* @param inc Pointer to a function to do an atomic increment operation. Must be non-null.
* @param dec Pointer to a function to do an atomic decrement operation. Must be non-null.
* @param status Receives error values.
* @deprecated ICU 52. This function is no longer supported.
* @system
*/
U_DEPRECATED void U_EXPORT2
u_setAtomicIncDecFunctions(const void *context, UMtxAtomicFn *inc, UMtxAtomicFn *dec,
UErrorCode *status);
#endif /* U_HIDE_DEPRECATED_API */
#endif /* U_HIDE_SYSTEM_API */
#endif

2053
thirdparty/icu4c/common/unicode/ucnv.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,164 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2000-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* ucnv_cb.h:
* External APIs for the ICU's codeset conversion library
* Helena Shih
*
* Modification History:
*
* Date Name Description
*/
/**
* \file
* \brief C API: UConverter functions to aid the writers of callbacks
*
* <h2> Callback API for UConverter </h2>
*
* These functions are provided here for the convenience of the callback
* writer. If you are just looking for callback functions to use, please
* see ucnv_err.h. DO NOT call these functions directly when you are
* working with converters, unless your code has been called as a callback
* via ucnv_setFromUCallBack or ucnv_setToUCallBack !!
*
* A note about error codes and overflow. Unlike other ICU functions,
* these functions do not expect the error status to be U_ZERO_ERROR.
* Callbacks must be much more careful about their error codes.
* The error codes used here are in/out parameters, which should be passed
* back in the callback's error parameter.
*
* For example, if you call ucnv_cbFromUWriteBytes to write data out
* to the output codepage, it may return U_BUFFER_OVERFLOW_ERROR if
* the data did not fit in the target. But this isn't a failing error;
* in fact, ucnv_cbFromUWriteBytes may be called AGAIN with the error
* status still U_BUFFER_OVERFLOW_ERROR to attempt to write further bytes,
* which will also go into the internal overflow buffers.
*
* Concerning offsets, the 'offset' parameters here are relative to the start
* of SOURCE. For example, suppose the string "ABCD" was being converted
* from Unicode into a codepage which doesn't have a mapping for 'B'.
* 'A' will be written out correctly, but
* the FromU callback will be called on an unassigned character for 'B'.
* At this point, this is the state of the world:
* Target: A [..] [points after A]
* Source: A B [C] D [points to C - B has been consumed]
* 0 1 2 3
* codePoint = "B" [the unassigned codepoint]
*
* Now, suppose a callback wants to write the substitution character '?' to
* the target. It calls ucnv_cbFromUWriteBytes() to write the ?.
* It should pass ZERO as the offset, because the offset as far as the
* callback is concerned is relative to the SOURCE pointer [which points
* before 'C'.] If the callback goes into the args and consumes 'C' also,
* it would call ucnv_cbFromUWriteBytes with an offset of 1 (and advance the source
* pointer).
*
*/
#ifndef UCNV_CB_H
#define UCNV_CB_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
/**
* ONLY used by FromU callback functions.
* Writes out the specified output bytes to the target byte buffer or to converter internal buffers.
*
* @param args callback fromUnicode arguments
* @param source source bytes to write
* @param length length of bytes to write
* @param offsetIndex the relative offset index from callback.
* @param err error status. If <TT>U_BUFFER_OVERFLOW_ERROR</TT> is returned, then U_BUFFER_OVERFLOW_ERROR <STRONG>must</STRONG>
* be returned to the user, because it means that not all data could be written into the target buffer, and some is
* in the converter error buffer.
* @see ucnv_cbFromUWriteSub
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ucnv_cbFromUWriteBytes (UConverterFromUnicodeArgs *args,
const char* source,
int32_t length,
int32_t offsetIndex,
UErrorCode * err);
/**
* ONLY used by FromU callback functions.
* This function will write out the correct substitution character sequence
* to the target.
*
* @param args callback fromUnicode arguments
* @param offsetIndex the relative offset index from the current source pointer to be used
* @param err error status. If <TT>U_BUFFER_OVERFLOW_ERROR</TT> is returned, then U_BUFFER_OVERFLOW_ERROR <STRONG>must</STRONG>
* be returned to the user, because it means that not all data could be written into the target buffer, and some is
* in the converter error buffer.
* @see ucnv_cbFromUWriteBytes
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ucnv_cbFromUWriteSub (UConverterFromUnicodeArgs *args,
int32_t offsetIndex,
UErrorCode * err);
/**
* ONLY used by fromU callback functions.
* This function will write out the error character(s) to the target UChar buffer.
*
* @param args callback fromUnicode arguments
* @param source pointer to pointer to first UChar to write [on exit: 1 after last UChar processed]
* @param sourceLimit pointer after last UChar to write
* @param offsetIndex the relative offset index from callback which will be set
* @param err error status <TT>U_BUFFER_OVERFLOW_ERROR</TT>
* @see ucnv_cbToUWriteSub
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 ucnv_cbFromUWriteUChars(UConverterFromUnicodeArgs *args,
const UChar** source,
const UChar* sourceLimit,
int32_t offsetIndex,
UErrorCode * err);
/**
* ONLY used by ToU callback functions.
* This function will write out the specified characters to the target
* UChar buffer.
*
* @param args callback toUnicode arguments
* @param source source string to write
* @param length the length of source string
* @param offsetIndex the relative offset index which will be written.
* @param err error status <TT>U_BUFFER_OVERFLOW_ERROR</TT>
* @see ucnv_cbToUWriteSub
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 ucnv_cbToUWriteUChars (UConverterToUnicodeArgs *args,
const UChar* source,
int32_t length,
int32_t offsetIndex,
UErrorCode * err);
/**
* ONLY used by ToU callback functions.
* This function will write out the Unicode substitution character (U+FFFD).
*
* @param args callback toUnicode arguments
* @param offsetIndex the relative offset index from callback.
* @param err error status <TT>U_BUFFER_OVERFLOW_ERROR</TT>
* @see ucnv_cbToUWriteUChars
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 ucnv_cbToUWriteSub (UConverterToUnicodeArgs *args,
int32_t offsetIndex,
UErrorCode * err);
#endif
#endif

View File

@@ -0,0 +1,465 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
*
* ucnv_err.h:
*/
/**
* \file
* \brief C API: UConverter predefined error callbacks
*
* <h2>Error Behaviour Functions</h2>
* Defines some error behaviour functions called by ucnv_{from,to}Unicode
* These are provided as part of ICU and many are stable, but they
* can also be considered only as an example of what can be done with
* callbacks. You may of course write your own.
*
* If you want to write your own, you may also find the functions from
* ucnv_cb.h useful when writing your own callbacks.
*
* These functions, although public, should NEVER be called directly.
* They should be used as parameters to the ucnv_setFromUCallBack
* and ucnv_setToUCallBack functions, to set the behaviour of a converter
* when it encounters ILLEGAL/UNMAPPED/INVALID sequences.
*
* usage example: 'STOP' doesn't need any context, but newContext
* could be set to something other than 'NULL' if needed. The available
* contexts in this header can modify the default behavior of the callback.
*
* \code
* UErrorCode err = U_ZERO_ERROR;
* UConverter *myConverter = ucnv_open("ibm-949", &err);
* const void *oldContext;
* UConverterFromUCallback oldAction;
*
*
* if (U_SUCCESS(err))
* {
* ucnv_setFromUCallBack(myConverter,
* UCNV_FROM_U_CALLBACK_STOP,
* NULL,
* &oldAction,
* &oldContext,
* &err);
* }
* \endcode
*
* The code above tells "myConverter" to stop when it encounters an
* ILLEGAL/TRUNCATED/INVALID sequence when it is used to convert from
* Unicode -> Codepage. The behavior from Codepage to Unicode is not changed,
* and ucnv_setToUCallBack would need to be called in order to change
* that behavior too.
*
* Here is an example with a context:
*
* \code
* UErrorCode err = U_ZERO_ERROR;
* UConverter *myConverter = ucnv_open("ibm-949", &err);
* const void *oldContext;
* UConverterToUCallback oldAction;
*
*
* if (U_SUCCESS(err))
* {
* ucnv_setToUCallBack(myConverter,
* UCNV_TO_U_CALLBACK_SUBSTITUTE,
* UCNV_SUB_STOP_ON_ILLEGAL,
* &oldAction,
* &oldContext,
* &err);
* }
* \endcode
*
* The code above tells "myConverter" to stop when it encounters an
* ILLEGAL/TRUNCATED/INVALID sequence when it is used to convert from
* Codepage -> Unicode. Any unmapped but legal characters will be
* replaced with the default substitution character.
*/
#ifndef UCNV_ERR_H
#define UCNV_ERR_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
/**
* Forward declaration of the UConverter structure. This header only uses
* pointers to it, so the full definition is not required here.
* @stable ICU 2.0
*/
struct UConverter;
/** Lets the converter type be written as UConverter, without the struct keyword. @stable ICU 2.0 */
typedef struct UConverter UConverter;
/**
* FROM_U, TO_U context options for sub callback.
* Pass as the callback context to make the substitute callbacks stop on an
* illegal sequence instead of substituting it.
* Note: has the same string value as UCNV_SKIP_STOP_ON_ILLEGAL.
* @stable ICU 2.0
*/
#define UCNV_SUB_STOP_ON_ILLEGAL "i"
/**
* FROM_U, TO_U context options for skip callback.
* Pass as the callback context to make the skip callbacks stop on an
* illegal sequence instead of skipping it.
* Note: has the same string value as UCNV_SUB_STOP_ON_ILLEGAL.
* @stable ICU 2.0
*/
#define UCNV_SKIP_STOP_ON_ILLEGAL "i"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX).
* This option is NULL, i.e. it is what an escape callback gets when no context is supplied.
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_ICU NULL
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_JAVA "J"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
* TO_U_CALLBACK_ESCAPE context option to escape the character value according to C (\\xXXXX)
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_C "C"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
* TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_XML_DEC "D"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
* TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_XML_HEX "X"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
* @stable ICU 2.0
*/
#define UCNV_ESCAPE_UNICODE "U"
/**
* FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H<space>, that is,
* a backslash, 1..6 hex digits, and a space)
* @stable ICU 4.0
*/
#define UCNV_ESCAPE_CSS2 "S"
/**
* The process condition code to be used with the callbacks.
* Codes which are greater than UCNV_IRREGULAR (that is, UCNV_RESET,
* UCNV_CLOSE and UCNV_CLONE) should be passed on to any chained callbacks.
* @stable ICU 2.0
*/
typedef enum {
UCNV_UNASSIGNED = 0, /**< The code point is unassigned.
The error code U_INVALID_CHAR_FOUND will be set. */
UCNV_ILLEGAL = 1, /**< The code point is illegal. For example,
\\x81\\x2E is illegal in SJIS because \\x2E
is not a valid trail byte for the \\x81
lead byte.
Also, starting with Unicode 3.0.1, non-shortest byte sequences
in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061)
are also illegal, not just irregular.
The error code U_ILLEGAL_CHAR_FOUND will be set. */
UCNV_IRREGULAR = 2, /**< The codepoint is not a regular sequence in
the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF
are irregular UTF-8 byte sequences for single surrogate
code points.
The error code U_INVALID_CHAR_FOUND will be set. */
UCNV_RESET = 3, /**< The callback is called with this reason when a
'reset' has occurred. Callback should reset all
state. */
UCNV_CLOSE = 4, /**< Called when the converter is closed. The
callback should release any allocated memory.*/
UCNV_CLONE = 5 /**< Called when ucnv_safeClone() is called on the
converter. The pointer available as the
'context' is an alias to the original converter's
context pointer. If the context must be owned
by the new converter, the callback must clone
the data and call ucnv_setFromUCallBack
(or ucnv_setToUCallBack) with the correct pointer.
@stable ICU 2.2
*/
} UConverterCallbackReason;
/**
* The structure for the fromUnicode callback function parameter.
* The source side is UChar-based and the target side is char-based.
* @stable ICU 2.0
*/
typedef struct {
uint16_t size; /**< The size of this struct. @stable ICU 2.0 */
UBool flush; /**< The internal state of converter will be reset and data flushed if set to true. @stable ICU 2.0 */
UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */
const UChar *source; /**< Pointer to the source buffer. @stable ICU 2.0 */
const UChar *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */
char *target; /**< Pointer to the target buffer. @stable ICU 2.0 */
const char *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */
int32_t *offsets; /**< Pointer to the buffer that receives the offsets (usage pattern: *offsets = value; offsets++;). @stable ICU 2.0 */
} UConverterFromUnicodeArgs;
/**
* The structure for the toUnicode callback function parameter.
* The source side is char-based and the target side is UChar-based.
* @stable ICU 2.0
*/
typedef struct {
uint16_t size; /**< The size of this struct. @stable ICU 2.0 */
UBool flush; /**< The internal state of converter will be reset and data flushed if set to true. @stable ICU 2.0 */
UConverter *converter; /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */
const char *source; /**< Pointer to the source buffer. @stable ICU 2.0 */
const char *sourceLimit; /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0 */
UChar *target; /**< Pointer to the target buffer. @stable ICU 2.0 */
const UChar *targetLimit; /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0 */
int32_t *offsets; /**< Pointer to the buffer that receives the offsets (usage pattern: *offsets = value; offsets++;). @stable ICU 2.0 */
} UConverterToUnicodeArgs;
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
*
* @param context Pointer to the callback's private data
* @param fromUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Number of UChars in codeUnits
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
* @param reason Defines the reason the callback was invoked
* @param err This should always be set to a failure status prior to calling.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP (
const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* Install it with ucnv_setToUCallBack instead.
* This To Unicode callback STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
*
* @param context Pointer to the callback's private data
* @param toUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @param err This should always be set to a failure status prior to calling.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
const void *context,
UConverterToUnicodeArgs *toUArgs,
const char* codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback skips any ILLEGAL_SEQUENCE, or
* skips only UNASSIGNED_SEQUENCE depending on the context parameter,
* simply ignoring those characters.
*
* @param context The function currently recognizes the callback options:
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Skips any ILLEGAL_SEQUENCE
* @param fromUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Number of UChars in codeUnits
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE, or
* UNASSIGNED_SEQUENCE depending on context parameter, with the
* current substitution string for the converter. This is the default
* callback.
*
* @param context The function currently recognizes the callback options:
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Substitutes any ILLEGAL_SEQUENCE
* @param fromUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Number of UChars in codeUnits
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @see ucnv_setSubstChars
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* This From Unicode callback will Substitute the ILLEGAL SEQUENCE with an
* escaped (by default hexadecimal) representation of the illegal codepoints
*
* @param context The function currently recognizes the callback options:
* <ul>
* <li>UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format %UXXXX (e.g. "%UFFFE%U00AC%UC8FE").
* In the Event the converter doesn't support the characters {%,U}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
* %UD84D%UDC56</li>
* <li>UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format \\uXXXX (e.g. "\\uFFFE\\u00AC\\uC8FE").
* In the Event the converter doesn't support the characters {\,u}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
* \\uD84D\\uDC56</li>
* <li>UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format \\uXXXX (e.g. "\\uFFFE\\u00AC\\uC8FE").
* In the Event the converter doesn't support the characters {\,u,U}[A-F][0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
* \\U00023456</li>
* <li>UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal
* representation in the format \htmlonly&amp;#DDDDDDDD;, e.g. "&amp;#65534;&amp;#172;&amp;#51454;")\endhtmlonly.
* In the Event the converter doesn't support the characters {&amp;,#}[0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
* &amp;#144470; and Zero padding is ignored.</li>
* <li>UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
* representation in the format \htmlonly&amp;#xXXXX; e.g. "&amp;#xFFFE;&amp;#x00AC;&amp;#xC8FE;")\endhtmlonly.
* In the Event the converter doesn't support the characters {&amp;,#,x}[0-9],
* it will substitute the illegal sequence with the substitution characters.
* Note that codeUnit(32bit int eg: unit of a surrogate pair) is represented as
* \htmlonly&amp;#x23456;\endhtmlonly</li>
* <li>UCNV_ESCAPE_UNICODE: Substitutes the ILLEGAL SEQUENCE with the
* representation in the format U+XXXXX.</li>
* <li>UCNV_ESCAPE_CSS2: Substitutes the ILLEGAL SEQUENCE according to CSS2
* conventions: a backslash, 1..6 hex digits, and a space.</li>
* </ul>
* @param fromUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Number of UChars in codeUnits
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE (
const void *context,
UConverterFromUnicodeArgs *fromUArgs,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* Install it with ucnv_setToUCallBack instead.
* This To Unicode callback skips any ILLEGAL_SEQUENCE, or
* skips only UNASSIGNED_SEQUENCE depending on the context parameter,
* simply ignoring those characters.
*
* @param context The function currently recognizes the callback options:
* UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Skips any ILLEGAL_SEQUENCE
* @param toUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
const void *context,
UConverterToUnicodeArgs *toUArgs,
const char* codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* Install it with ucnv_setToUCallBack instead.
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE, or
* UNASSIGNED_SEQUENCE depending on context parameter, with the
* Unicode substitution character, U+FFFD.
*
* @param context The function currently recognizes the callback options:
* UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
* returning the error code back to the caller immediately.
* NULL: Substitutes any ILLEGAL_SEQUENCE
* @param toUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE (
const void *context,
UConverterToUnicodeArgs *toUArgs,
const char* codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode * err);
/**
* DO NOT CALL THIS FUNCTION DIRECTLY!
* Install it with ucnv_setToUCallBack instead.
* This To Unicode callback will Substitute the ILLEGAL SEQUENCE with the
* hexadecimal representation of the illegal bytes
* (in the format %XNN, e.g. "%XFF%X0A%XC8%X03").
*
* @param context This function currently recognizes the callback options:
* UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC,
* UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE.
* @param toUArgs Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @param err Return value will be set to success if the callback was handled,
* otherwise this value will be set to a failure status.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE (
const void *context,
UConverterToUnicodeArgs *toUArgs,
const char* codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode * err);
#endif
#endif
/*UCNV_ERR_H*/

View File

@@ -0,0 +1,193 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2008-2011, International Business Machines
* Corporation, Google and others. All Rights Reserved.
*
*******************************************************************************
*/
/*
* Author : eldawy@google.com (Mohamed Eldawy)
* ucnvsel.h
*
* Purpose: To generate a list of encodings capable of handling
* a given Unicode text
*
* Started 09-April-2008
*/
#ifndef __ICU_UCNV_SEL_H__
#define __ICU_UCNV_SEL_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_CONVERSION
#include "unicode/uset.h"
#include "unicode/utf16.h"
#include "unicode/uenum.h"
#include "unicode/ucnv.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C API: Encoding/charset encoding selector
*
* A converter selector is built with a set of encoding/charset names
* and given an input string returns the set of names of the
* corresponding converters which can convert the string.
*
* A converter selector can be serialized into a buffer and reopened
* from the serialized form.
*/
/* Opaque selector structure; this header only declares it and uses pointers to it. */
struct UConverterSelector;
/**
* @{
* Typedef for selector data structure.
* Selectors are created with ucnvsel_open() or ucnvsel_openFromSerialized()
* and released with ucnvsel_close().
*/
typedef struct UConverterSelector UConverterSelector;
/** @} */
/**
* Open a selector.
* If converterListSize is 0, build for all available converters.
* If excludedCodePoints is NULL, don't exclude any code points.
* The returned selector is closed with ucnvsel_close().
*
* @param converterList a pointer to encoding names needed to be involved.
* Can be NULL if converterListSize==0.
* The list and the names will be cloned, and the caller
* retains ownership of the original.
* @param converterListSize number of encodings in above list.
* If 0, builds a selector for all available converters.
* @param excludedCodePoints a set of code points to be excluded from consideration.
* That is, excluded code points in a string do not change
* the selection result. (They might be handled by a callback.)
* Use NULL to exclude nothing.
* @param whichSet what converter set to use? Use this to determine whether
* to consider only roundtrip mappings or also fallbacks.
* @param status an in/out ICU UErrorCode
* @return the new selector
*
* @stable ICU 4.2
*/
U_CAPI UConverterSelector* U_EXPORT2
ucnvsel_open(const char* const* converterList, int32_t converterListSize,
const USet* excludedCodePoints,
const UConverterUnicodeSet whichSet, UErrorCode* status);
/**
* Closes a selector.
* If any Enumerations were returned by ucnvsel_selectForString() or
* ucnvsel_selectForUTF8(), they become invalid.
* They can be closed before or after calling ucnvsel_close(),
* but should never be used after the selector is closed.
*
* @see ucnvsel_selectForString
* @see ucnvsel_selectForUTF8
*
* @param sel selector to close
*
* @stable ICU 4.2
*/
U_CAPI void U_EXPORT2
ucnvsel_close(UConverterSelector *sel);
/* The smart-pointer wrapper is C++ only, so it is hidden unless the C++ API is shown. */
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUConverterSelectorPointer
* "Smart pointer" class, closes a UConverterSelector via ucnvsel_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUConverterSelectorPointer, UConverterSelector, ucnvsel_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
* Open a selector from its serialized form, as produced by ucnvsel_serialize().
* The buffer must remain valid and unchanged for the lifetime of the selector.
* This is much faster than creating a selector from scratch.
* Using a serialized form from a different machine (endianness/charset) is supported.
*
* @param buffer pointer to the serialized form of a converter selector;
* must be 32-bit-aligned
* @param length the capacity of this buffer (can be equal to or larger than
* the actual data length)
* @param status an in/out ICU UErrorCode
* @return the new selector
*
* @stable ICU 4.2
*/
U_CAPI UConverterSelector* U_EXPORT2
ucnvsel_openFromSerialized(const void* buffer, int32_t length, UErrorCode* status);
/**
* Serialize a selector into a linear buffer.
* The serialized form is portable to different machines and can be
* reopened with ucnvsel_openFromSerialized().
*
* @param sel selector to consider
* @param buffer pointer to 32-bit-aligned memory to be filled with the
* serialized form of this converter selector
* @param bufferCapacity the capacity of this buffer
* @param status an in/out ICU UErrorCode
* @return the required buffer capacity to hold the serialized data (even if the call fails
* with a U_BUFFER_OVERFLOW_ERROR, it will return the required capacity)
*
* @stable ICU 4.2
*/
U_CAPI int32_t U_EXPORT2
ucnvsel_serialize(const UConverterSelector* sel,
void* buffer, int32_t bufferCapacity, UErrorCode* status);
/**
* Select converters that can map all characters in a UTF-16 string,
* ignoring the excluded code points.
* The returned enumeration becomes invalid once the selector is closed
* with ucnvsel_close().
*
* @param sel a selector
* @param s UTF-16 string
* @param length length of the string, or -1 if NUL-terminated
* @param status an in/out ICU UErrorCode
* @return an enumeration containing encoding names.
* The returned encoding names and their order will be the same as
* supplied when building the selector.
*
* @stable ICU 4.2
*/
U_CAPI UEnumeration * U_EXPORT2
ucnvsel_selectForString(const UConverterSelector* sel,
const UChar *s, int32_t length, UErrorCode *status);
/**
* Select converters that can map all characters in a UTF-8 string,
* ignoring the excluded code points.
* The returned enumeration becomes invalid once the selector is closed
* with ucnvsel_close().
*
* @param sel a selector
* @param s UTF-8 string
* @param length length of the string, or -1 if NUL-terminated
* @param status an in/out ICU UErrorCode
* @return an enumeration containing encoding names.
* The returned encoding names and their order will be the same as
* supplied when building the selector.
*
* @stable ICU 4.2
*/
U_CAPI UEnumeration * U_EXPORT2
ucnvsel_selectForUTF8(const UConverterSelector* sel,
const char *s, int32_t length, UErrorCode *status);
#endif /* !UCONFIG_NO_CONVERSION */
#endif /* __ICU_UCNV_SEL_H__ */

View File

@@ -0,0 +1,477 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: uconfig.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002sep19
* created by: Markus W. Scherer
*/
#ifndef __UCONFIG_H__
#define __UCONFIG_H__
/*!
* \file
* \brief User-configurable settings
*
* Miscellaneous switches:
*
* A number of macros affect a variety of minor aspects of ICU.
* Most of them used to be defined elsewhere (e.g., in utypes.h or platform.h)
* and moved here to make them easier to find.
*
* Switches for excluding parts of ICU library code modules:
*
* Changing these macros allows building partial, smaller libraries for special purposes.
* By default, all modules are built.
* The switches are fairly coarse, controlling large modules.
* Basic services cannot be turned off.
*
* Building with any of these options does not guarantee that the
* ICU build process will completely work. It is recommended that
* the ICU libraries and data be built using the normal build.
* At that time you should remove the data used by those services.
* After building the ICU data library, you should rebuild the ICU
* libraries with these switches customized to your needs.
*
* @stable ICU 2.4
*/
/**
* When UCONFIG_USE_LOCAL is defined, the header "uconfig_local.h" is included
* before any of the default uconfig settings below are determined, so that
* file can predefine them.
*
* @internal ICU 4.0
*/
#ifdef UCONFIG_USE_LOCAL
# include "uconfig_local.h"
#endif
/**
* \def U_DEBUG
* Controls whether debugging code is included (1) or not (0).
* A value predefined by the build is kept as is. Otherwise the value is
* derived from _DEBUG, which sets it automatically on Windows; most other
* compilers have no related predefined macro.
* @internal
*/
#if !defined(U_DEBUG)
# if defined(_DEBUG)
/*
* Visual Studio defines _DEBUG for debug compilation.
* NDEBUG is deliberately *not* tested: it is an orthogonal macro
* which only disables assert().
*/
# define U_DEBUG 1
# else
# define U_DEBUG 0
# endif
#endif
/**
* \def UCLN_NO_AUTO_CLEANUP
* Determines whether to enable auto cleanup of libraries.
* Defaults to 1 unless predefined by the build.
* NOTE(review): going by the macro name, 1 presumably means that automatic
* cleanup is NOT performed - confirm against the cleanup code.
* @internal
*/
#if !defined(UCLN_NO_AUTO_CLEANUP)
# define UCLN_NO_AUTO_CLEANUP 1
#endif
/**
* \def U_DISABLE_RENAMING
* Determines whether to disable renaming or not.
* Defaults to 0 unless predefined by the build.
* @internal
*/
#if !defined(U_DISABLE_RENAMING)
# define U_DISABLE_RENAMING 0
#endif
/**
* \def U_NO_DEFAULT_INCLUDE_UTF_HEADERS
* Determines whether utypes.h includes utf.h, utf8.h, utf16.h and utf_old.h.
* utypes.h includes those headers if this macro is defined to 0.
* Otherwise, each of those headers must be included explicitly when using one of their macros.
* Defaults to 0 for backward compatibility, except inside ICU, where any of
* the *_IMPLEMENTATION macros tested below switches the default to 1.
* @stable ICU 49
*/
#if !defined(U_NO_DEFAULT_INCLUDE_UTF_HEADERS)
# if defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) || \
defined(U_LAYOUT_IMPLEMENTATION) || defined(U_LAYOUTEX_IMPLEMENTATION) || \
defined(U_TOOLUTIL_IMPLEMENTATION)
# define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
# else
# define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 0
# endif
#endif
/**
* \def U_OVERRIDE_CXX_ALLOCATION
* Determines whether to override new and delete.
* ICU is normally built such that all of its C++ classes, via their UMemory base,
* override operators new and delete to use its internal, customizable,
* non-exception-throwing memory allocation functions. (Default value 1 for this macro.)
*
* This matters most when the application and its libraries use multiple heaps.
* For example, on Windows, this allows the ICU DLL to be used by
* applications that statically link the C Runtime library.
*
* @stable ICU 2.2
*/
#if !defined(U_OVERRIDE_CXX_ALLOCATION)
# define U_OVERRIDE_CXX_ALLOCATION 1
#endif
/**
* \def U_ENABLE_TRACING
* Determines whether to enable tracing.
* Defaults to 0 unless predefined by the build.
* @internal
*/
#if !defined(U_ENABLE_TRACING)
# define U_ENABLE_TRACING 0
#endif
/**
* \def UCONFIG_ENABLE_PLUGINS
* Determines whether to enable ICU plugins.
* Defaults to 0 unless predefined by the build.
* @internal
*/
#if !defined(UCONFIG_ENABLE_PLUGINS)
# define UCONFIG_ENABLE_PLUGINS 0
#endif
/**
* \def U_ENABLE_DYLOAD
* Whether to enable Dynamic loading in ICU.
* Defaults to 1 unless predefined by the build.
* @internal
*/
#if !defined(U_ENABLE_DYLOAD)
# define U_ENABLE_DYLOAD 1
#endif
/**
* \def U_CHECK_DYLOAD
* Whether to test Dynamic loading as an OS capability.
* Defaults to 1 unless predefined by the build.
* @internal
*/
#if !defined(U_CHECK_DYLOAD)
# define U_CHECK_DYLOAD 1
#endif
/**
* \def U_DEFAULT_SHOW_DRAFT
* Do we allow ICU users to use the draft APIs by default?
* Defaults to 1 unless predefined by the build.
* @internal
*/
#if !defined(U_DEFAULT_SHOW_DRAFT)
# define U_DEFAULT_SHOW_DRAFT 1
#endif
/*===========================================================================*/
/* Custom icu entry point renaming */
/*===========================================================================*/
/**
* \def U_HAVE_LIB_SUFFIX
* 1 if a custom library suffix is set.
* A predefined value is kept. Otherwise the macro is defined to 1 when
* U_LIB_SUFFIX_C_NAME is defined (or when building the Doxygen docs), and is
* left undefined in every other case.
* @internal
*/
#if !defined(U_HAVE_LIB_SUFFIX) && (defined(U_LIB_SUFFIX_C_NAME) || defined(U_IN_DOXYGEN))
# define U_HAVE_LIB_SUFFIX 1
#endif
/**
* \def U_LIB_SUFFIX_C_NAME_STRING
* Defines the library suffix as a string with C syntax.
* A predefined value is kept. Otherwise it is the stringified value of
* U_LIB_SUFFIX_C_NAME if that is defined, or the empty string if not.
* @internal
*/
#ifdef U_LIB_SUFFIX_C_NAME_STRING
/* Use the predefined value. */
#elif defined(U_LIB_SUFFIX_C_NAME)
/*
* Stringify in two steps. The # operator does not macro-expand its operand, so
* a single-level CONVERT_TO_STRING(U_LIB_SUFFIX_C_NAME) would yield the text
* "U_LIB_SUFFIX_C_NAME" rather than the configured suffix. Passing the
* argument through one more macro level expands it before it is stringified.
*/
# define U_LIB_SUFFIX_STRINGIFY(s) #s
# define CONVERT_TO_STRING(s) U_LIB_SUFFIX_STRINGIFY(s)
# define U_LIB_SUFFIX_C_NAME_STRING CONVERT_TO_STRING(U_LIB_SUFFIX_C_NAME)
#else
# define U_LIB_SUFFIX_C_NAME_STRING ""
#endif
/* common/i18n library switches --------------------------------------------- */
/**
* \def UCONFIG_ONLY_COLLATION
* This switch turns off modules that are not needed for collation.
*
* Legacy conversion stays enabled, because ICU needs it to work on EBCDIC
* platforms (for the default converter).
* If you want "only collation" and do not build for EBCDIC,
* then you can define UCONFIG_NO_CONVERSION or UCONFIG_NO_LEGACY_CONVERSION to 1 as well.
*
* @stable ICU 2.4
*/
#if !defined(UCONFIG_ONLY_COLLATION)
# define UCONFIG_ONLY_COLLATION 0
#endif
#if UCONFIG_ONLY_COLLATION
/* common library */
# define UCONFIG_NO_BREAK_ITERATION 1
# define UCONFIG_NO_IDNA 1
/* i18n library */
# if UCONFIG_NO_COLLATION
# error Contradictory collation switches in uconfig.h.
# endif
# define UCONFIG_NO_FORMATTING 1
# define UCONFIG_NO_TRANSLITERATION 1
# define UCONFIG_NO_REGULAR_EXPRESSIONS 1
#endif
/* common library switches -------------------------------------------------- */
/**
* \def UCONFIG_NO_FILE_IO
* This switch turns off all file access in the common library
* where file access is only used for data loading.
* ICU data must then be provided in the form of a data DLL (or with an
* equivalent way to link to the data residing in an executable,
* as in building a combined library with both the common library's code and
* the data), or via udata_setCommonData().
* Application data must be provided via udata_setAppData() or by using
* "open" functions that take pointers to data, for example ucol_openBinary().
*
* File access is not used at all in the i18n library.
*
* File access cannot be turned off for the icuio library or for the ICU
* test suites and ICU tools.
*
* @stable ICU 3.6
*/
#if !defined(UCONFIG_NO_FILE_IO)
# define UCONFIG_NO_FILE_IO 0
#endif
/* Turning file I/O off while also defining U_TIMEZONE_FILES_DIR is rejected as contradictory. */
#if UCONFIG_NO_FILE_IO
# if defined(U_TIMEZONE_FILES_DIR)
# error Contradictory file io switches in uconfig.h.
# endif
#endif
/**
* \def UCONFIG_NO_CONVERSION
* This switch turns off all converters.
* ICU will not completely build (compiling the tools fails) with this
* switch turned on.
*
* You may want to use this together with U_CHARSET_IS_UTF8 defined to 1
* in utypes.h if char* strings in your environment are always in UTF-8.
*
* @stable ICU 3.2
* @see U_CHARSET_IS_UTF8
*/
#if !defined(UCONFIG_NO_CONVERSION)
# define UCONFIG_NO_CONVERSION 0
#endif
/* Without any converters there are no legacy converters either. */
#if UCONFIG_NO_CONVERSION
# define UCONFIG_NO_LEGACY_CONVERSION 1
#endif
/**
 * \def UCONFIG_ONLY_HTML_CONVERSION
 * Set to 1 to turn off every converter that is NOT listed in
 * the HTML encoding standard:
 * http://www.w3.org/TR/encoding/#names-and-labels
 *
 * This cannot be used on EBCDIC platforms
 * because they need ibm-37 or ibm-1047 default converters.
 *
 * @stable ICU 55
 */
#if !defined(UCONFIG_ONLY_HTML_CONVERSION)
#   define UCONFIG_ONLY_HTML_CONVERSION 0
#endif
/**
 * \def UCONFIG_NO_LEGACY_CONVERSION
 * Set to 1 to turn off all converters except those for
 * - Unicode charsets (UTF-7/8/16/32, CESU-8, SCSU, BOCU-1)
 * - US-ASCII
 * - ISO-8859-1
 *
 * Legacy conversion cannot be turned off on EBCDIC platforms
 * because they need ibm-37 or ibm-1047 default converters.
 *
 * @stable ICU 2.4
 */
#if !defined(UCONFIG_NO_LEGACY_CONVERSION)
#   define UCONFIG_NO_LEGACY_CONVERSION 0
#endif
/**
 * \def UCONFIG_NO_NORMALIZATION
 * Set to 1 to turn off normalization.
 * Doing so also turns off several services that depend on it,
 * for example collation and IDNA.
 *
 * @stable ICU 2.6
 */
#if !defined(UCONFIG_NO_NORMALIZATION)
#   define UCONFIG_NO_NORMALIZATION 0
#endif
/**
 * \def UCONFIG_USE_ML_PHRASE_BREAKING
 * Set to 1 to use BudouX ML phrase-based line breaking instead of the dictionary.
 *
 * @internal
 */
#if !defined(UCONFIG_USE_ML_PHRASE_BREAKING)
#   define UCONFIG_USE_ML_PHRASE_BREAKING 0
#endif
/*
 * Services that cannot work without normalization are switched off with it.
 * Combining "no normalization" with "only collation" is rejected.
 */
#if UCONFIG_NO_NORMALIZATION
    /* common library */
    /* ICU 50 CJK dictionary BreakIterator uses normalization */
#   define UCONFIG_NO_BREAK_ITERATION 1
    /* IDNA (UTS #46) is implemented via normalization */
#   define UCONFIG_NO_IDNA 1
    /* i18n library */
#   if UCONFIG_ONLY_COLLATION
#       error Contradictory collation switches in uconfig.h.
#   endif
#   define UCONFIG_NO_COLLATION 1
#   define UCONFIG_NO_TRANSLITERATION 1
#endif
/**
 * \def UCONFIG_NO_BREAK_ITERATION
 * Set to 1 to turn off break iteration.
 *
 * @stable ICU 2.4
 */
#if !defined(UCONFIG_NO_BREAK_ITERATION)
#   define UCONFIG_NO_BREAK_ITERATION 0
#endif
/**
 * \def UCONFIG_NO_IDNA
 * Set to 1 to turn off IDNA.
 *
 * @stable ICU 2.6
 */
#if !defined(UCONFIG_NO_IDNA)
#   define UCONFIG_NO_IDNA 0
#endif
/**
 * \def UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE
 * Selects the default UMessagePatternApostropheMode.
 * See the documentation of that enum for the available modes.
 *
 * @stable ICU 4.8
 */
#if !defined(UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE)
#   define UCONFIG_MSGPAT_DEFAULT_APOSTROPHE_MODE UMSGPAT_APOS_DOUBLE_OPTIONAL
#endif
/**
 * \def UCONFIG_USE_WINDOWS_LCID_MAPPING_API
 * On platforms where U_PLATFORM_HAS_WIN32_API is true, set to 1 to use the
 * Windows platform APIs for LCID<->Locale Name conversions.
 * Otherwise, only the built-in ICU tables are used.
 *
 * @internal ICU 64
 */
#if !defined(UCONFIG_USE_WINDOWS_LCID_MAPPING_API)
#   define UCONFIG_USE_WINDOWS_LCID_MAPPING_API 1
#endif
/* i18n library switches ---------------------------------------------------- */
/**
 * \def UCONFIG_NO_COLLATION
 * Set to 1 to turn off collation and collation-based string search.
 *
 * @stable ICU 2.4
 */
#if !defined(UCONFIG_NO_COLLATION)
#   define UCONFIG_NO_COLLATION 0
#endif
/**
 * \def UCONFIG_NO_FORMATTING
 * Set to 1 to turn off formatting and calendar/timezone services.
 *
 * @stable ICU 2.4
 */
#if !defined(UCONFIG_NO_FORMATTING)
#   define UCONFIG_NO_FORMATTING 0
#endif
/**
 * \def UCONFIG_NO_MF2
 * Set to 1 to turn off the experimental MessageFormat 2.0 API.
 *
 * @internal ICU 75 technology preview
 * @deprecated This API is for technology preview only.
 */
#if !defined(UCONFIG_NO_MF2)
#   define UCONFIG_NO_MF2 0
#endif
/**
 * \def UCONFIG_NO_TRANSLITERATION
 * Set to 1 to turn off transliteration.
 *
 * @stable ICU 2.4
 */
#if !defined(UCONFIG_NO_TRANSLITERATION)
#   define UCONFIG_NO_TRANSLITERATION 0
#endif
/**
 * \def UCONFIG_NO_REGULAR_EXPRESSIONS
 * Set to 1 to turn off regular expressions.
 *
 * @stable ICU 2.4
 */
#if !defined(UCONFIG_NO_REGULAR_EXPRESSIONS)
#   define UCONFIG_NO_REGULAR_EXPRESSIONS 0
#endif
/**
 * \def UCONFIG_NO_SERVICE
 * Set to 1 to turn off service registration.
 *
 * @stable ICU 3.2
 */
#if !defined(UCONFIG_NO_SERVICE)
#   define UCONFIG_NO_SERVICE 0
#endif
/**
 * \def UCONFIG_HAVE_PARSEALLINPUT
 * Set to 1 (the default) to turn on the "parse all input" attribute.
 * Changing this setting is binary incompatible.
 *
 * @internal
 */
#if !defined(UCONFIG_HAVE_PARSEALLINPUT)
#   define UCONFIG_HAVE_PARSEALLINPUT 1
#endif
/**
 * \def UCONFIG_NO_FILTERED_BREAK_ITERATION
 * Set to 1 to turn off the filtered break iteration code.
 *
 * @internal
 */
#if !defined(UCONFIG_NO_FILTERED_BREAK_ITERATION)
#   define UCONFIG_NO_FILTERED_BREAK_ITERATION 0
#endif
#endif // __UCONFIG_H__

158
thirdparty/icu4c/common/unicode/ucpmap.h vendored Normal file
View File

@@ -0,0 +1,158 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// ucpmap.h
// created: 2018sep03 Markus W. Scherer
#ifndef __UCPMAP_H__
#define __UCPMAP_H__
#include "unicode/utypes.h"
U_CDECL_BEGIN
/**
* \file
* \brief C API: This file defines an abstract map from Unicode code points to integer values.
*
* @see UCPMap
* @see UCPTrie
* @see UMutableCPTrie
*/
/**
 * Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
 *
 * This header only declares the struct name, not its members; the functions
 * declared here (ucpmap_get(), ucpmap_getRange()) take a const UCPMap pointer.
 *
 * @see UCPTrie
 * @see UMutableCPTrie
 * @stable ICU 63
 */
typedef struct UCPMap UCPMap;
/**
 * Options that control how ucpmap_getRange() and related functions report
 * value ranges that overlap with surrogates.
 * UCPMAP_RANGE_NORMAL is the right choice for most users.
 *
 * @see ucpmap_getRange
 * @see ucptrie_getRange
 * @see umutablecptrie_getRange
 * @stable ICU 63
 */
enum UCPMapRangeOption {
    /**
     * ucpmap_getRange() enumerates all same-value ranges exactly as stored in the map.
     * Most users should use this option.
     * @stable ICU 63
     */
    UCPMAP_RANGE_NORMAL = 0,
    /**
     * ucpmap_getRange() enumerates all same-value ranges as stored in the map,
     * except that lead surrogates (U+D800..U+DBFF) are reported as having the
     * surrogateValue, which is passed to getRange() as a separate parameter.
     * The surrogateValue is not transformed via filter().
     * See U_IS_LEAD(c).
     *
     * Most users should use UCPMAP_RANGE_NORMAL instead.
     *
     * This option is useful for maps that map surrogate code *units* to
     * special values optimized for UTF-16 string processing
     * or for special error behavior for unpaired surrogates,
     * where those values are not to be associated with the lead surrogate code *points*.
     * @stable ICU 63
     */
    UCPMAP_RANGE_FIXED_LEAD_SURROGATES = 1,
    /**
     * ucpmap_getRange() enumerates all same-value ranges as stored in the map,
     * except that all surrogates (U+D800..U+DFFF) are reported as having the
     * surrogateValue, which is passed to getRange() as a separate parameter.
     * The surrogateValue is not transformed via filter().
     * See U_IS_SURROGATE(c).
     *
     * Most users should use UCPMAP_RANGE_NORMAL instead.
     *
     * This option is useful for maps that map surrogate code *units* to
     * special values optimized for UTF-16 string processing
     * or for special error behavior for unpaired surrogates,
     * where those values are not to be associated with the surrogate code *points*.
     * @stable ICU 63
     */
    UCPMAP_RANGE_FIXED_ALL_SURROGATES = 2
};
#if !defined(U_IN_DOXYGEN)
typedef enum UCPMapRangeOption UCPMapRangeOption;
#endif
/**
 * Returns the value for a code point as stored in the map, with range checking.
 * Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
 *
 * To walk whole ranges of equal values instead of single code points,
 * see ucpmap_getRange().
 *
 * @param map the map
 * @param c the code point
 * @return the map value,
 *         or an implementation-defined error value if the code point is not in the range 0..U+10FFFF
 * @see ucpmap_getRange
 * @stable ICU 63
 */
U_CAPI uint32_t U_EXPORT2
ucpmap_get(const UCPMap *map, UChar32 c);
/**
 * Callback function type: Modifies a map value.
 * Optionally called by ucpmap_getRange()/ucptrie_getRange()/umutablecptrie_getRange().
 * The modified value will be returned by the getRange function.
 *
 * Can be used to ignore some of the value bits,
 * make a filter for one of several values,
 * return a value index computed from the map value, etc.
 *
 * Note that this typedef names a function type, not a pointer type;
 * the getRange functions declare their parameter as `UCPMapValueFilter *`.
 *
 * @param context an opaque pointer, as passed into the getRange function
 * @param value a value from the map
 * @return the modified value
 * @stable ICU 63
 */
typedef uint32_t U_CALLCONV
UCPMapValueFilter(const void *context, uint32_t value);
/**
 * Returns the last code point such that all those from start to there have the same value.
 * Can be used to efficiently iterate over all same-value ranges in a map.
 * (This is normally faster than iterating over code points and get()ting each value,
 * but much slower than a data structure that stores ranges directly.)
 *
 * If the UCPMapValueFilter function pointer is not NULL, then
 * the value to be delivered is passed through that function, and the return value is the end
 * of the range where all values are modified to the same actual value.
 * The value is unchanged if that function pointer is NULL.
 *
 * Example:
 * \code
 * UChar32 start = 0, end;
 * uint32_t value;
 * while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
 *                               NULL, NULL, &value)) >= 0) {
 *     // Work with the range start..end and its value.
 *     start = end + 1;
 * }
 * \endcode
 * The example loop relies on the documented -1 return value to stop:
 * once start has moved past the last valid code point, the call returns -1.
 *
 * @param map the map
 * @param start range start
 * @param option defines whether surrogates are treated normally,
 *               or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
 * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
 * @param filter a pointer to a function that may modify the map data value,
 *               or NULL if the values from the map are to be used unmodified
 * @param context an opaque pointer that is passed on to the filter function
 * @param pValue if not NULL, receives the value that every code point start..end has;
 *               may have been modified by filter(context, map value)
 *               if that function pointer is not NULL
 * @return the range end code point, or -1 if start is not a valid code point
 * @see UCPMapRangeOption
 * @see UCPMapValueFilter
 * @stable ICU 63
 */
U_CAPI UChar32 U_EXPORT2
ucpmap_getRange(const UCPMap *map, UChar32 start,
UCPMapRangeOption option, uint32_t surrogateValue,
UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
U_CDECL_END
#endif

View File

@@ -0,0 +1,645 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// ucptrie.h (modified from utrie2.h)
// created: 2017dec29 Markus W. Scherer
#ifndef __UCPTRIE_H__
#define __UCPTRIE_H__
#include "unicode/utypes.h"
#include "unicode/ucpmap.h"
#include "unicode/utf8.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
U_CDECL_BEGIN
/**
* \file
* \brief C API: This file defines an immutable Unicode code point trie.
*
* @see UCPTrie
* @see UMutableCPTrie
*/
#ifndef U_IN_DOXYGEN
/**
 * Pointer to the trie's data array, viewed with one of several element widths.
 * The UCPTRIE_16, UCPTRIE_32 and UCPTRIE_8 macros read through
 * ptr16, ptr32 and ptr8 respectively.
 * @internal
 */
typedef union UCPTrieData {
/** Untyped view of the data pointer. @internal */
const void *ptr0;
/** View used for 16-bit data values (see UCPTRIE_16). @internal */
const uint16_t *ptr16;
/** View used for 32-bit data values (see UCPTRIE_32). @internal */
const uint32_t *ptr32;
/** View used for 8-bit data values (see UCPTRIE_8). @internal */
const uint8_t *ptr8;
} UCPTrieData;
#endif
/**
 * Immutable Unicode code point trie structure.
 * Fast, reasonably compact, map from Unicode code points (U+0000..U+10FFFF) to integer values.
 * For details see https://icu.unicode.org/design/struct/utrie
 *
 * Do not access UCPTrie fields directly; use public functions and macros.
 * Functions are easy to use: They support all trie types and value widths.
 *
 * When performance is really important, macros provide faster access.
 * Most macros are specific to either "fast" or "small" tries, see UCPTrieType.
 * There are "fast" macros for special optimized use cases.
 *
 * The macros will return bogus values, or may crash, if used on the wrong type or value width.
 *
 * @see UMutableCPTrie
 * @stable ICU 63
 */
struct UCPTrie {
#ifndef U_IN_DOXYGEN
/**
 * Index array. The lookup macros in this header read it directly,
 * for example index[c >> UCPTRIE_FAST_SHIFT] in _UCPTRIE_FAST_INDEX.
 * @internal
 */
const uint16_t *index;
/** Data array, read via UCPTRIE_16, UCPTRIE_32 or UCPTRIE_8. @internal */
UCPTrieData data;
/** Presumably the number of entries in index; not used by the macros in this header. @internal */
int32_t indexLength;
/**
 * The macros in this header use this to locate two special values:
 * data[dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET] for errors and
 * data[dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET] for highStart..U+10FFFF.
 * @internal
 */
int32_t dataLength;
/** Start of the last range which ends at U+10FFFF. @internal */
UChar32 highStart;
/** highStart>>12; compared against partially decoded 4-byte UTF-8 in UCPTRIE_FAST_U8_NEXT. @internal */
uint16_t shifted12HighStart;
/** @internal */
int8_t type; // UCPTrieType
/** @internal */
int8_t valueWidth; // UCPTrieValueWidth
/** padding/reserved @internal */
uint32_t reserved32;
/** padding/reserved @internal */
uint16_t reserved16;
/**
 * Internal index-3 null block offset.
 * Set to an impossibly high value (e.g., 0xffff) if there is no dedicated index-3 null block.
 * @internal
 */
uint16_t index3NullOffset;
/**
 * Internal data null block offset, not shifted.
 * Set to an impossibly high value (e.g., 0xfffff) if there is no dedicated data null block.
 * @internal
 */
int32_t dataNullOffset;
/** @internal */
uint32_t nullValue;
#ifdef UCPTRIE_DEBUG
/** Only present in UCPTRIE_DEBUG builds, which changes the struct layout. @internal */
const char *name;
#endif
#endif
};
#ifndef U_IN_DOXYGEN
typedef struct UCPTrie UCPTrie;
#endif
/**
 * Selectors for the type of a UCPTrie.
 * The types offer different trade-offs between size and speed.
 *
 * @see umutablecptrie_buildImmutable
 * @see ucptrie_openFromBinary
 * @see ucptrie_getType
 * @stable ICU 63
 */
enum UCPTrieType {
    /**
     * Tells ucptrie_openFromBinary() to accept any type.
     * ucptrie_getType() will return the actual type.
     * @stable ICU 63
     */
    UCPTRIE_TYPE_ANY = -1,
    /**
     * Fast/simple/larger BMP data structure. Use functions and "fast" macros.
     * @stable ICU 63
     */
    UCPTRIE_TYPE_FAST = 0,
    /**
     * Small/slower BMP data structure. Use functions and "small" macros.
     * @stable ICU 63
     */
    UCPTRIE_TYPE_SMALL = 1
};
#if !defined(U_IN_DOXYGEN)
typedef enum UCPTrieType UCPTrieType;
#endif
/**
 * Selectors for the number of bits in a UCPTrie data value.
 * The enum constants are selectors, not the bit counts themselves.
 *
 * @see umutablecptrie_buildImmutable
 * @see ucptrie_openFromBinary
 * @see ucptrie_getValueWidth
 * @stable ICU 63
 */
enum UCPTrieValueWidth {
    /**
     * Tells ucptrie_openFromBinary() to accept any data value width.
     * ucptrie_getValueWidth() will return the actual data value width.
     * @stable ICU 63
     */
    UCPTRIE_VALUE_BITS_ANY = -1,
    /**
     * The trie stores 16 bits per data value.
     * It returns them as unsigned values 0..0xffff=65535.
     * @stable ICU 63
     */
    UCPTRIE_VALUE_BITS_16 = 0,
    /**
     * The trie stores 32 bits per data value.
     * @stable ICU 63
     */
    UCPTRIE_VALUE_BITS_32 = 1,
    /**
     * The trie stores 8 bits per data value.
     * It returns them as unsigned values 0..0xff=255.
     * @stable ICU 63
     */
    UCPTRIE_VALUE_BITS_8 = 2
};
#if !defined(U_IN_DOXYGEN)
typedef enum UCPTrieValueWidth UCPTrieValueWidth;
#endif
/**
 * Opens a trie from its binary form, stored in 32-bit-aligned memory.
 * Inverse of ucptrie_toBinary().
 *
 * The memory must remain valid and unchanged as long as the trie is used.
 * You must ucptrie_close() the trie once you are done using it.
 *
 * @param type selects the trie type; results in an
 *             U_INVALID_FORMAT_ERROR if it does not match the binary data;
 *             use UCPTRIE_TYPE_ANY to accept any type
 * @param valueWidth selects the number of bits in a data value; results in an
 *                   U_INVALID_FORMAT_ERROR if it does not match the binary data;
 *                   use UCPTRIE_VALUE_BITS_ANY to accept any data value width
 * @param data a pointer to 32-bit-aligned memory containing the binary data of a UCPTrie
 * @param length the number of bytes available at data;
 *               can be more than necessary
 * @param pActualLength receives the actual number of bytes at data taken up by the trie data;
 *                      can be NULL
 * @param pErrorCode an in/out ICU UErrorCode
 * @return the trie
 *
 * @see umutablecptrie_open
 * @see umutablecptrie_buildImmutable
 * @see ucptrie_toBinary
 * @see ucptrie_close
 * @stable ICU 63
 */
U_CAPI UCPTrie * U_EXPORT2
ucptrie_openFromBinary(UCPTrieType type, UCPTrieValueWidth valueWidth,
const void *data, int32_t length, int32_t *pActualLength,
UErrorCode *pErrorCode);
/**
 * Closes a trie and releases associated memory.
 * Every trie obtained from ucptrie_openFromBinary() must be closed this way
 * once it is no longer used.
 *
 * @param trie the trie
 * @see ucptrie_openFromBinary
 * @stable ICU 63
 */
U_CAPI void U_EXPORT2
ucptrie_close(UCPTrie *trie);
/**
 * Returns the trie type.
 * For a trie opened with UCPTRIE_TYPE_ANY this is the actual type of the trie.
 *
 * @param trie the trie
 * @return the trie type
 * @see ucptrie_openFromBinary
 * @see UCPTRIE_TYPE_ANY
 * @stable ICU 63
 */
U_CAPI UCPTrieType U_EXPORT2
ucptrie_getType(const UCPTrie *trie);
/**
 * Returns the selector for the number of bits in a trie data value.
 * The result is a UCPTrieValueWidth constant such as UCPTRIE_VALUE_BITS_16,
 * not the numeric bit count. For a trie opened with UCPTRIE_VALUE_BITS_ANY
 * this is the actual data value width of the trie.
 *
 * @param trie the trie
 * @return the UCPTrieValueWidth selector for the number of bits in a trie data value
 * @see ucptrie_openFromBinary
 * @see UCPTRIE_VALUE_BITS_ANY
 * @stable ICU 63
 */
U_CAPI UCPTrieValueWidth U_EXPORT2
ucptrie_getValueWidth(const UCPTrie *trie);
/**
 * Returns the value for a code point as stored in the trie, with range checking.
 * Returns the trie error value if c is not in the range 0..U+10FFFF.
 *
 * Easier to use than UCPTRIE_FAST_GET() and similar macros but slower.
 * Easier to use because, unlike the macros, this function works on all UCPTrie
 * objects, for all types and value widths.
 *
 * @param trie the trie
 * @param c the code point
 * @return the trie value,
 *         or the trie error value if the code point is not in the range 0..U+10FFFF
 * @see UCPTRIE_FAST_GET
 * @see UCPTRIE_SMALL_GET
 * @stable ICU 63
 */
U_CAPI uint32_t U_EXPORT2
ucptrie_get(const UCPTrie *trie, UChar32 c);
/**
 * Returns the last code point such that all those from start to there have the same value.
 * Can be used to efficiently iterate over all same-value ranges in a trie.
 * (This is normally faster than iterating over code points and get()ting each value,
 * but much slower than a data structure that stores ranges directly.)
 *
 * If the UCPMapValueFilter function pointer is not NULL, then
 * the value to be delivered is passed through that function, and the return value is the end
 * of the range where all values are modified to the same actual value.
 * The value is unchanged if that function pointer is NULL.
 *
 * Example:
 * \code
 * UChar32 start = 0, end;
 * uint32_t value;
 * while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
 *                                NULL, NULL, &value)) >= 0) {
 *     // Work with the range start..end and its value.
 *     start = end + 1;
 * }
 * \endcode
 * The example loop relies on the documented -1 return value to stop:
 * once start has moved past the last valid code point, the call returns -1.
 *
 * @param trie the trie
 * @param start range start
 * @param option defines whether surrogates are treated normally,
 *               or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
 * @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
 * @param filter a pointer to a function that may modify the trie data value,
 *               or NULL if the values from the trie are to be used unmodified
 * @param context an opaque pointer that is passed on to the filter function
 * @param pValue if not NULL, receives the value that every code point start..end has;
 *               may have been modified by filter(context, trie value)
 *               if that function pointer is not NULL
 * @return the range end code point, or -1 if start is not a valid code point
 * @see UCPMapRangeOption
 * @see UCPMapValueFilter
 * @stable ICU 63
 */
U_CAPI UChar32 U_EXPORT2
ucptrie_getRange(const UCPTrie *trie, UChar32 start,
UCPMapRangeOption option, uint32_t surrogateValue,
UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
/**
 * Writes a memory-mappable form of the trie into 32-bit aligned memory.
 * Inverse of ucptrie_openFromBinary().
 *
 * To find out how many bytes are needed without writing anything ("preflighting"),
 * call with capacity==0; data may then be NULL.
 *
 * @param trie the trie
 * @param data a pointer to 32-bit-aligned memory to be filled with the trie data;
 *             can be NULL if capacity==0
 * @param capacity the number of bytes available at data, or 0 for pure preflighting
 * @param pErrorCode an in/out ICU UErrorCode;
 *                   U_BUFFER_OVERFLOW_ERROR if the capacity is too small
 * @return the number of bytes written or (if buffer overflow) needed for the trie
 *
 * @see ucptrie_openFromBinary()
 * @stable ICU 63
 */
U_CAPI int32_t U_EXPORT2
ucptrie_toBinary(const UCPTrie *trie, void *data, int32_t capacity, UErrorCode *pErrorCode);
/**
 * Data accessor for a trie with 16-bit data values: reads entry i of the
 * trie's data array through its 16-bit pointer.
 * Pass the name of this macro as the "dataAccess" argument of other macros.
 * Do not use this macro in any other way.
 *
 * @see UCPTRIE_VALUE_BITS_16
 * @stable ICU 63
 */
#define UCPTRIE_16(trie, i) \
    ((trie)->data.ptr16[(i)])
/**
 * Data accessor for a trie with 32-bit data values: reads entry i of the
 * trie's data array through its 32-bit pointer.
 * Pass the name of this macro as the "dataAccess" argument of other macros.
 * Do not use this macro in any other way.
 *
 * @see UCPTRIE_VALUE_BITS_32
 * @stable ICU 63
 */
#define UCPTRIE_32(trie, i) \
    ((trie)->data.ptr32[(i)])
/**
 * Data accessor for a trie with 8-bit data values: reads entry i of the
 * trie's data array through its 8-bit pointer.
 * Pass the name of this macro as the "dataAccess" argument of other macros.
 * Do not use this macro in any other way.
 *
 * @see UCPTRIE_VALUE_BITS_8
 * @stable ICU 63
 */
#define UCPTRIE_8(trie, i) \
    ((trie)->data.ptr8[(i)])
/**
 * Looks up the trie value for a code point in a "fast" trie, with range checking.
 * Yields the trie error value if c is not in the range 0..U+10FFFF.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param c (UChar32, in) the input code point
 * @return The code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_FAST_GET(trie, dataAccess, c) \
    dataAccess(trie, _UCPTRIE_CP_INDEX(trie, 0xffff, c))
/**
 * Looks up the trie value for a code point in a "small" trie, with range checking.
 * Yields the trie error value if c is not in the range U+0000..U+10FFFF.
 * The width of the value depends on the dataAccess macro that is passed in.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_SMALL
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param c (UChar32, in) the input code point
 * @return The code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_SMALL_GET(trie, dataAccess, c) dataAccess(trie, _UCPTRIE_CP_INDEX(trie, UCPTRIE_SMALL_MAX, c))
/**
 * UTF-16: Reads the next code point (UChar32 c, out), post-increments src,
 * and gets a value from the trie.
 * Sets the trie error value if c is an unpaired surrogate.
 *
 * How the lookup proceeds:
 * - A non-surrogate code unit is looked up via _UCPTRIE_FAST_INDEX.
 * - A lead surrogate that is followed (before limit) by a trail surrogate is combined
 *   into a supplementary code point, src is advanced past the trail surrogate,
 *   and the value is looked up via _UCPTRIE_SMALL_INDEX.
 * - Any other surrogate is left in c as is, and the result is the value stored at
 *   data index dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET.
 *
 * The first code unit is read unconditionally, so src must not be at limit on entry.
 * This is a statement-block macro: its arguments may be evaluated more than once,
 * and it declares the local names __index and __c2.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param src (const UChar *, in/out) the source text pointer
 * @param limit (const UChar *, in) the limit pointer for the text, or NULL if NUL-terminated
 * @param c (UChar32, out) variable for the code point
 * @param result (out) variable for the trie lookup result
 * @stable ICU 63
 */
#define UCPTRIE_FAST_U16_NEXT(trie, dataAccess, src, limit, c, result) UPRV_BLOCK_MACRO_BEGIN { \
(c) = *(src)++; \
int32_t __index; \
if (!U16_IS_SURROGATE(c)) { \
__index = _UCPTRIE_FAST_INDEX(trie, c); \
} else { \
uint16_t __c2; \
if (U16_IS_SURROGATE_LEAD(c) && (src) != (limit) && U16_IS_TRAIL(__c2 = *(src))) { \
++(src); \
(c) = U16_GET_SUPPLEMENTARY((c), __c2); \
__index = _UCPTRIE_SMALL_INDEX(trie, c); \
} else { \
__index = (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; \
} \
} \
(result) = dataAccess(trie, __index); \
} UPRV_BLOCK_MACRO_END
/**
 * UTF-16: Reads the previous code point (UChar32 c, out), pre-decrements src,
 * and gets a value from the trie.
 * Sets the trie error value if c is an unpaired surrogate.
 *
 * How the lookup proceeds:
 * - A non-surrogate code unit is looked up via _UCPTRIE_FAST_INDEX.
 * - A trail surrogate that is preceded (not before start) by a lead surrogate is combined
 *   into a supplementary code point, src is moved back onto the lead surrogate,
 *   and the value is looked up via _UCPTRIE_SMALL_INDEX.
 * - Any other surrogate is left in c as is, and the result is the value stored at
 *   data index dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET.
 *
 * The code unit before src is read unconditionally, so src must not be at start on entry.
 * This is a statement-block macro: its arguments may be evaluated more than once,
 * and it declares the local names __index and __c2.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param start (const UChar *, in) the start pointer for the text
 * @param src (const UChar *, in/out) the source text pointer
 * @param c (UChar32, out) variable for the code point
 * @param result (out) variable for the trie lookup result
 * @stable ICU 63
 */
#define UCPTRIE_FAST_U16_PREV(trie, dataAccess, start, src, c, result) UPRV_BLOCK_MACRO_BEGIN { \
(c) = *--(src); \
int32_t __index; \
if (!U16_IS_SURROGATE(c)) { \
__index = _UCPTRIE_FAST_INDEX(trie, c); \
} else { \
uint16_t __c2; \
if (U16_IS_SURROGATE_TRAIL(c) && (src) != (start) && U16_IS_LEAD(__c2 = *((src) - 1))) { \
--(src); \
(c) = U16_GET_SUPPLEMENTARY(__c2, (c)); \
__index = _UCPTRIE_SMALL_INDEX(trie, c); \
} else { \
__index = (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; \
} \
} \
(result) = dataAccess(trie, __index); \
} UPRV_BLOCK_MACRO_END
/**
 * UTF-8: Post-increments src and gets a value from the trie.
 * Sets the trie error value for an ill-formed byte sequence.
 *
 * Unlike UCPTRIE_FAST_U16_NEXT() this UTF-8 macro does not provide the code point
 * because it would be more work to do so and is often not needed.
 * If the trie value differs from the error value, then the byte sequence is well-formed,
 * and the code point can be assembled without revalidation.
 *
 * How the lookup proceeds:
 * - A single-byte lead (U8_IS_SINGLE) is used directly as the data index.
 * - Lead bytes 0xC2..0xDF (U+0080..U+07FF) and 0xE0..0xEF (U+0800..U+FFFF) are validated
 *   inline together with their trail bytes and resolved through trie->index.
 * - Lead bytes 0xF0..0xF4 (U+10000..U+10FFFF) are validated inline; partially decoded
 *   values at or above shifted12HighStart use the value at data index
 *   dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET, all others call
 *   ucptrie_internalSmallU8Index().
 * - Anything else is ill-formed and yields the value at data index
 *   dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET. In that case src has been advanced
 *   past the lead byte and past those trail bytes that were accepted before the
 *   sequence turned out to be ill-formed.
 *
 * The lead byte is read unconditionally, so src must not be at limit on entry.
 * This is a statement-block macro: its arguments may be evaluated more than once,
 * and it declares the local names __lead, __t1, __t2 and __t3.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param src (const char *, in/out) the source text pointer
 * @param limit (const char *, in) the limit pointer for the text (must not be NULL)
 * @param result (out) variable for the trie lookup result
 * @stable ICU 63
 */
#define UCPTRIE_FAST_U8_NEXT(trie, dataAccess, src, limit, result) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __lead = (uint8_t)*(src)++; \
if (!U8_IS_SINGLE(__lead)) { \
uint8_t __t1, __t2, __t3; \
if ((src) != (limit) && \
(__lead >= 0xe0 ? \
__lead < 0xf0 ? /* U+0800..U+FFFF except surrogates */ \
U8_LEAD3_T1_BITS[__lead &= 0xf] & (1 << ((__t1 = *(src)) >> 5)) && \
++(src) != (limit) && (__t2 = *(src) - 0x80) <= 0x3f && \
(__lead = ((int32_t)(trie)->index[(__lead << 6) + (__t1 & 0x3f)]) + __t2, 1) \
: /* U+10000..U+10FFFF */ \
(__lead -= 0xf0) <= 4 && \
U8_LEAD4_T1_BITS[(__t1 = *(src)) >> 4] & (1 << __lead) && \
(__lead = (__lead << 6) | (__t1 & 0x3f), ++(src) != (limit)) && \
(__t2 = *(src) - 0x80) <= 0x3f && \
++(src) != (limit) && (__t3 = *(src) - 0x80) <= 0x3f && \
(__lead = __lead >= (trie)->shifted12HighStart ? \
(trie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET : \
ucptrie_internalSmallU8Index((trie), __lead, __t2, __t3), 1) \
: /* U+0080..U+07FF */ \
__lead >= 0xc2 && (__t1 = *(src) - 0x80) <= 0x3f && \
(__lead = (int32_t)(trie)->index[__lead & 0x1f] + __t1, 1))) { \
++(src); \
} else { \
__lead = (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET; /* ill-formed*/ \
} \
} \
(result) = dataAccess(trie, __lead); \
} UPRV_BLOCK_MACRO_END
/**
 * UTF-8: Pre-decrements src and gets a value from the trie.
 * Sets the trie error value for an ill-formed byte sequence.
 *
 * Unlike UCPTRIE_FAST_U16_PREV() this UTF-8 macro does not provide the code point
 * because it would be more work to do so and is often not needed.
 * If the trie value differs from the error value, then the byte sequence is well-formed,
 * and the code point can be assembled without revalidation.
 *
 * How the lookup proceeds:
 * - A single-byte value (U8_IS_SINGLE) is used directly as the data index.
 * - Otherwise ucptrie_internalU8PrevIndex() is called. The macro treats the low 3 bits
 *   of its result as the number of further bytes to move src back by,
 *   and the remaining bits (result >> 3) as the data index.
 *
 * The byte before src is read unconditionally, so src must not be at start on entry.
 * This is a statement-block macro: its arguments may be evaluated more than once,
 * and it declares the local name __index.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param start (const char *, in) the start pointer for the text
 * @param src (const char *, in/out) the source text pointer
 * @param result (out) variable for the trie lookup result
 * @stable ICU 63
 */
#define UCPTRIE_FAST_U8_PREV(trie, dataAccess, start, src, result) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __index = (uint8_t)*--(src); \
if (!U8_IS_SINGLE(__index)) { \
__index = ucptrie_internalU8PrevIndex((trie), __index, (const uint8_t *)(start), \
(const uint8_t *)(src)); \
(src) -= __index & 7; \
__index >>= 3; \
} \
(result) = dataAccess(trie, __index); \
} UPRV_BLOCK_MACRO_END
/**
 * Looks up the trie value for an ASCII code point, without range checking.
 * The code point is used directly as the data index.
 *
 * @param trie (const UCPTrie *, in) the trie (of either fast or small type)
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param c (UChar32, in) the input code point; must be U+0000..U+007F
 * @return The ASCII code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_ASCII_GET(trie, dataAccess, c) \
    dataAccess(trie, c)
/**
 * Looks up the trie value for a BMP code point (U+0000..U+FFFF), without range checking.
 * Also suitable for looking up a value for a UTF-16 code unit, provided that
 * other parts of the string processing check for surrogates.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param c (UChar32, in) the input code point, must be U+0000..U+FFFF
 * @return The BMP code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_FAST_BMP_GET(trie, dataAccess, c) \
    dataAccess(trie, _UCPTRIE_FAST_INDEX(trie, c))
/**
 * Looks up the trie value for a supplementary code point (U+10000..U+10FFFF),
 * without range checking.
 *
 * @param trie (const UCPTrie *, in) the trie; must have type UCPTRIE_TYPE_FAST
 * @param dataAccess UCPTRIE_16, UCPTRIE_32, or UCPTRIE_8 according to the trie's value width
 * @param c (UChar32, in) the input code point, must be U+10000..U+10FFFF
 * @return The supplementary code point's trie value.
 * @stable ICU 63
 */
#define UCPTRIE_FAST_SUPP_GET(trie, dataAccess, c) \
    dataAccess(trie, _UCPTRIE_SMALL_INDEX(trie, c))
/* Internal definitions ----------------------------------------------------- */
#ifndef U_IN_DOXYGEN
/**
 * Constants used by the implementation.
 * The API macros need them, but users should not use them directly.
 * @internal
 */
enum {
    /** Shift width for splitting a code point below the fast limit into block and offset. @internal */
    UCPTRIE_FAST_SHIFT = 6,

    /** Number of entries in a data block for code points below the fast limit. 64=0x40 @internal */
    UCPTRIE_FAST_DATA_BLOCK_LENGTH = 1 << UCPTRIE_FAST_SHIFT,

    /** Mask for getting the lower bits for the in-fast-data-block offset. @internal */
    UCPTRIE_FAST_DATA_MASK = UCPTRIE_FAST_DATA_BLOCK_LENGTH - 1,

    /** Passed as the fast-lookup limit by UCPTRIE_SMALL_GET. @internal */
    UCPTRIE_SMALL_MAX = 0xFFF,

    /**
     * Offset from dataLength (to be subtracted) for fetching the
     * value returned for out-of-range code points and ill-formed UTF-8/16.
     * @internal
     */
    UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET = 1,

    /**
     * Offset from dataLength (to be subtracted) for fetching the
     * value returned for code points highStart..U+10FFFF.
     * @internal
     */
    UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET = 2
};
/* Internal functions and macros -------------------------------------------- */
// Do not conditionalize with #ifndef U_HIDE_INTERNAL_API, needed for public API
/**
 * Internal function behind _UCPTRIE_SMALL_INDEX: that macro calls it for
 * code points below trie->highStart and uses the result as a data index.
 * Do not call directly.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
ucptrie_internalSmallIndex(const UCPTrie *trie, UChar32 c);
/**
 * Internal function for part of the UCPTRIE_FAST_U8_NEXT() macro implementation,
 * which calls it for four-byte UTF-8 sequences below trie->shifted12HighStart
 * and uses the result as a data index.
 * As passed by that macro, lt1 is ((lead byte - 0xf0) << 6) | (first trail byte & 0x3f),
 * and t2 and t3 are the second and third trail bytes minus 0x80.
 * Do not call directly.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
ucptrie_internalSmallU8Index(const UCPTrie *trie, int32_t lt1, uint8_t t2, uint8_t t3);
/**
 * Internal function for part of the UCPTRIE_FAST_U8_PREV() macro implementation.
 * The macro passes the non-ASCII byte it has just read as c, and interprets the result
 * as (data index << 3) | (number of additional bytes to move src back by).
 * Do not call directly.
 * @internal
 */
U_CAPI int32_t U_EXPORT2
ucptrie_internalU8PrevIndex(const UCPTrie *trie, UChar32 c,
const uint8_t *start, const uint8_t *src);
/**
 * Internal trie getter for a code point below the fast limit.
 * Adds the offset within the data block (low bits of c) to the block start
 * found in the index array, and returns the resulting data index.
 * @internal
 */
#define _UCPTRIE_FAST_INDEX(trie, c) \
    (((c) & UCPTRIE_FAST_DATA_MASK) + (int32_t)(trie)->index[(c) >> UCPTRIE_FAST_SHIFT])
/**
 * Internal trie getter for a code point at or above the fast limit.
 * Code points below highStart are resolved by ucptrie_internalSmallIndex();
 * all others share the high value near the end of the data array.
 * Returns the data index.
 * @internal
 */
#define _UCPTRIE_SMALL_INDEX(trie, c) \
    ((c) < (trie)->highStart ? \
        ucptrie_internalSmallIndex(trie, c) : \
        (trie)->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET)
/**
 * Internal trie getter for any code point, including the check that c is in U+0000..10FFFF.
 * Code points up to fastMax use the fast index, other valid code points use the
 * small index, and everything else maps to the error value near the end of the data array.
 * Returns the data index.
 * @internal
 */
#define _UCPTRIE_CP_INDEX(trie, fastMax, c) \
    ((uint32_t)(c) <= (uint32_t)(fastMax) \
        ? _UCPTRIE_FAST_INDEX(trie, c) \
        : ((uint32_t)(c) <= 0x10ffff \
            ? _UCPTRIE_SMALL_INDEX(trie, c) \
            : (trie)->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET))
U_CDECL_END
#endif // U_IN_DOXYGEN
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
 * \class LocalUCPTriePointer
 * "Smart pointer" class, closes a UCPTrie via ucptrie_close().
 * For most methods see the LocalPointerBase base class.
 *
 * The class itself is generated by the U_DEFINE_LOCAL_OPEN_POINTER macro
 * (presumably from unicode/localpointer.h, which this header includes for C++),
 * and is only available when U_SHOW_CPLUSPLUS_API is enabled.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 63
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCPTriePointer, UCPTrie, ucptrie_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
#endif

466
thirdparty/icu4c/common/unicode/ucurr.h vendored Normal file
View File

@@ -0,0 +1,466 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef _UCURR_H_
#define _UCURR_H_
#include "unicode/utypes.h"
#include "unicode/uenum.h"
/**
* \file
* \brief C API: Encapsulates information about a currency.
*
* The ucurr API encapsulates information about a currency, as defined by
* ISO 4217. A currency is represented by a 3-character string
* containing its ISO 4217 code. This API can return various data
* necessary for the proper display of a currency:
*
* <ul><li>A display symbol, for a specific locale
* <li>The number of fraction digits to display
* <li>A rounding increment
* </ul>
*
* The <tt>DecimalFormat</tt> class uses these data to display
* currencies.
* @author Alan Liu
* @since ICU 2.2
*/
#if !UCONFIG_NO_FORMATTING
/**
* Currency usage used for Decimal Format.
* @stable ICU 54
*/
enum UCurrencyUsage {
/**
* A setting to specify currency usage which determines currency digit
* and rounding for standard usage, for example: "50.00 NT$".
* Used as the DEFAULT value.
* @stable ICU 54
*/
UCURR_USAGE_STANDARD=0,
/**
* A setting to specify currency usage which determines currency digit
* and rounding for cash usage, for example: "50 NT$".
* @stable ICU 54
*/
UCURR_USAGE_CASH=1,
#ifndef U_HIDE_DEPRECATED_API
/**
* One higher than the last enum UCurrencyUsage constant.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UCURR_USAGE_COUNT=2
#endif // U_HIDE_DEPRECATED_API
};
/** Currency usage used for Decimal Format (typedef of enum UCurrencyUsage). */
typedef enum UCurrencyUsage UCurrencyUsage;
/**
* Finds a currency code for the given locale.
* @param locale the locale for which to retrieve a currency code.
* The currency can be specified by the "currency" keyword,
* in which case it overrides the default currency code.
* @param buff fill-in buffer. Can be NULL for preflighting.
* @param buffCapacity capacity of the fill-in buffer. Can be 0 for
* preflighting. If it is non-zero, the buff parameter
* must not be NULL.
* @param ec error code
* @return length of the currency string. It should always be 3. If 0,
* the currency couldn't be found or the input values are
* invalid.
* @stable ICU 2.8
*/
U_CAPI int32_t U_EXPORT2
ucurr_forLocale(const char* locale,
UChar* buff,
int32_t buffCapacity,
UErrorCode* ec);
/**
* Selector constants for ucurr_getName().
*
* @see ucurr_getName
* @stable ICU 2.6
*/
typedef enum UCurrNameStyle {
/**
* Selector for ucurr_getName() indicating a symbolic name for a
* currency, such as "$" for USD.
* @stable ICU 2.6
*/
UCURR_SYMBOL_NAME,
/**
* Selector for ucurr_getName() indicating the long name for a
* currency, such as "US Dollar" for USD.
* @stable ICU 2.6
*/
UCURR_LONG_NAME,
/**
* Selector for ucurr_getName() indicating the narrow currency symbol.
* The narrow currency symbol is similar to the regular currency
* symbol, but it always takes the shortest form: for example,
* "$" instead of "US$" for USD in en-CA.
*
* @stable ICU 61
*/
UCURR_NARROW_SYMBOL_NAME,
/**
* Selector for ucurr_getName() indicating the formal currency symbol.
* The formal currency symbol is similar to the regular currency
* symbol, but it always takes the form used in formal settings
* such as banking; for example, "NT$" instead of "$" for TWD in zh-TW.
*
* @stable ICU 68
*/
UCURR_FORMAL_SYMBOL_NAME,
/**
* Selector for ucurr_getName() indicating the variant currency symbol.
* The variant symbol for a currency is an alternative symbol
* that is not necessarily as widely used as the regular symbol.
*
* @stable ICU 68
*/
UCURR_VARIANT_SYMBOL_NAME
} UCurrNameStyle;
#if !UCONFIG_NO_SERVICE
/**
* Registry key type: the value returned by ucurr_register() and
* accepted by ucurr_unregister().
* @stable ICU 2.6
*/
typedef const void* UCurrRegistryKey;
/**
* Registers an (existing) ISO 4217 currency code for the given locale.
* Only the country code and the two variants EURO and PRE_EURO are
* recognized.
* @param isoCode the three-letter ISO 4217 currency code
* @param locale the locale for which to register this currency code
* @param status the in/out status code
* @return a registry key that can be passed to ucurr_unregister() to unregister
* this currency code, or NULL if there was an error.
* @stable ICU 2.6
*/
U_CAPI UCurrRegistryKey U_EXPORT2
ucurr_register(const UChar* isoCode,
const char* locale,
UErrorCode* status);
/**
* Unregisters the previously registered currency definitions using the
* UCurrRegistryKey returned from ucurr_register(). The key becomes invalid after
* a successful call and should not be used again. Any currency
* that might have been hidden by the original ucurr_register() call is
* restored.
* @param key the registry key returned by a previous call to ucurr_register()
* @param status the in/out status code, no special meanings are assigned
* @return true if the currency for this key was successfully unregistered
* @stable ICU 2.6
*/
U_CAPI UBool U_EXPORT2
ucurr_unregister(UCurrRegistryKey key, UErrorCode* status);
#endif /* UCONFIG_NO_SERVICE */
/**
* Returns the display name for the given currency in the
* given locale. For example, the display name for the USD
* currency object in the en_US locale is "$".
* @param currency null-terminated 3-letter ISO 4217 code
* @param locale locale in which to display currency
* @param nameStyle selector for which kind of name to return
* @param isChoiceFormat fill-in parameter that is always set to false; can be NULL.
* Display names are static strings;
* since ICU 4.4, ChoiceFormat patterns are no longer supported.
* @param len fill-in parameter to receive length of result
* @param ec error code
* @return pointer to display string of 'len' UChars. If the resource
* data contains no entry for 'currency', then 'currency' itself is
* returned.
* @stable ICU 2.6
*/
U_CAPI const UChar* U_EXPORT2
ucurr_getName(const UChar* currency,
const char* locale,
UCurrNameStyle nameStyle,
UBool* isChoiceFormat,
int32_t* len,
UErrorCode* ec);
/**
* Returns the plural name for the given currency in the
* given locale. For example, the plural name for the USD
* currency object in the en_US locale is "US dollar" or "US dollars".
* @param currency null-terminated 3-letter ISO 4217 code
* @param locale locale in which to display currency
* @param isChoiceFormat fill-in parameter that is always set to false; can be NULL.
* Display names are static strings;
* since ICU 4.4, ChoiceFormat patterns are no longer supported.
* @param pluralCount plural count
* @param len fill-in parameter to receive length of result
* @param ec error code
* @return pointer to display string of 'len' UChars. If the resource
* data contains no entry for 'currency', then 'currency' itself is
* returned.
* @stable ICU 4.2
*/
U_CAPI const UChar* U_EXPORT2
ucurr_getPluralName(const UChar* currency,
const char* locale,
UBool* isChoiceFormat,
const char* pluralCount,
int32_t* len,
UErrorCode* ec);
/**
* Returns the number of fraction digits that should
* be displayed for the given currency.
* This is equivalent to ucurr_getDefaultFractionDigitsForUsage(currency,UCURR_USAGE_STANDARD,ec);
*
* Important: The number of fraction digits for a given currency is NOT
* guaranteed to be constant across versions of ICU or CLDR. For example,
* do NOT use this value as a mechanism for deciding the magnitude used
* to store currency values in a database. You should use this value for
* display purposes only.
*
* @param currency null-terminated 3-letter ISO 4217 code
* @param ec input-output error code
* @return a non-negative number of fraction digits to be
* displayed, or 0 if there is an error
* @stable ICU 3.0
*/
U_CAPI int32_t U_EXPORT2
ucurr_getDefaultFractionDigits(const UChar* currency,
UErrorCode* ec);
/**
* Returns the number of fraction digits that should
* be displayed for the given currency with the given usage.
*
* Important: The number of fraction digits for a given currency is NOT
* guaranteed to be constant across versions of ICU or CLDR. For example,
* do NOT use this value as a mechanism for deciding the magnitude used
* to store currency values in a database. You should use this value for
* display purposes only.
*
* @param currency null-terminated 3-letter ISO 4217 code
* @param usage enum usage for the currency
* @param ec input-output error code
* @return a non-negative number of fraction digits to be
* displayed, or 0 if there is an error
* @stable ICU 54
*/
U_CAPI int32_t U_EXPORT2
ucurr_getDefaultFractionDigitsForUsage(const UChar* currency,
const UCurrencyUsage usage,
UErrorCode* ec);
/**
* Returns the rounding increment for the given currency, or 0.0 if no
* rounding is done by the currency.
* This is equivalent to ucurr_getRoundingIncrementForUsage(currency,UCURR_USAGE_STANDARD,ec);
* @param currency null-terminated 3-letter ISO 4217 code
* @param ec input-output error code
* @return the non-negative rounding increment, or 0.0 if there is none
* or if there is an error
* @stable ICU 3.0
*/
U_CAPI double U_EXPORT2
ucurr_getRoundingIncrement(const UChar* currency,
UErrorCode* ec);
/**
* Returns the rounding increment for the given currency and usage, or 0.0 if no
* rounding is done by the currency for the given usage.
* @param currency null-terminated 3-letter ISO 4217 code
* @param usage enum usage for the currency
* @param ec input-output error code
* @return the non-negative rounding increment, or 0.0 if there is none
* or if there is an error
* @stable ICU 54
*/
U_CAPI double U_EXPORT2
ucurr_getRoundingIncrementForUsage(const UChar* currency,
const UCurrencyUsage usage,
UErrorCode* ec);
/**
* Selector constants for ucurr_openISOCurrencies().
*
* @see ucurr_openISOCurrencies
* @stable ICU 3.2
*/
typedef enum UCurrCurrencyType {
/**
* Select all ISO-4217 currency codes.
* @stable ICU 3.2
*/
UCURR_ALL = INT32_MAX,
/**
* Select only ISO-4217 commonly used currency codes.
* These currencies can be found in common use, and they usually have
* bank notes or coins associated with the currency code.
* This does not include fund codes, precious metals and other
* various ISO-4217 codes limited to special financial products.
* @stable ICU 3.2
*/
UCURR_COMMON = 1,
/**
* Select ISO-4217 uncommon currency codes.
* These codes represent fund codes, precious metals and other
* various ISO-4217 codes limited to special financial products.
* A fund code is a monetary resource associated with a currency.
* @stable ICU 3.2
*/
UCURR_UNCOMMON = 2,
/**
* Select only deprecated ISO-4217 codes.
* These codes are no longer in general public use.
* @stable ICU 3.2
*/
UCURR_DEPRECATED = 4,
/**
* Select only non-deprecated ISO-4217 codes.
* These codes are in general public use.
* @stable ICU 3.2
*/
UCURR_NON_DEPRECATED = 8
} UCurrCurrencyType;
/**
* Provides a UEnumeration object for listing ISO-4217 codes.
* @param currType You can use one of several UCurrCurrencyType values for this
* parameter. You can also | (or) them together to get a specific list of
* currencies. Most people will want to use the (UCURR_COMMON|UCURR_NON_DEPRECATED) value to
* get a list of current currencies.
* @param pErrorCode Error code
* @return a UEnumeration object for listing the selected ISO-4217 codes
* @stable ICU 3.2
*/
U_CAPI UEnumeration * U_EXPORT2
ucurr_openISOCurrencies(uint32_t currType, UErrorCode *pErrorCode);
/**
* Queries if the given ISO 4217 3-letter code is available on the specified date range.
*
* Note: For checking availability of a currency on a specific date, specify the date on both 'from' and 'to'.
*
* When 'from' is U_DATE_MIN and 'to' is U_DATE_MAX, this method checks if the specified currency is available at any time.
* If 'from' and 'to' are the same UDate value, this method checks if the specified currency is available on that date.
*
* @param isoCode
* The ISO 4217 3-letter code.
*
* @param from
* The lower bound of the date range, inclusive. When 'from' is U_DATE_MIN, check the availability
* of the currency on any date before 'to'.
*
* @param to
* The upper bound of the date range, inclusive. When 'to' is U_DATE_MAX, check the availability of
* the currency on any date after 'from'.
*
* @param errorCode
* ICU error code
*
* @return true if the given ISO 4217 3-letter code is supported on the specified date range.
*
* @stable ICU 4.8
*/
U_CAPI UBool U_EXPORT2
ucurr_isAvailable(const UChar* isoCode,
UDate from,
UDate to,
UErrorCode* errorCode);
/**
* Finds the number of valid currency codes for the
* given locale and date.
* @param locale the locale for which to retrieve the
* currency count.
* @param date the date for which to retrieve the
* currency count for the given locale.
* @param ec error code
* @return the number of currency codes for the
* given locale and date. If 0, currency
* codes couldn't be found or the input
* values are invalid.
* @stable ICU 4.0
*/
U_CAPI int32_t U_EXPORT2
ucurr_countCurrencies(const char* locale,
UDate date,
UErrorCode* ec);
/**
* Finds a currency code for the given locale and date.
* @param locale the locale for which to retrieve a currency code.
* The currency can be specified by the "currency" keyword,
* in which case it overrides the default currency code.
* @param date the date for which to retrieve a currency code for
* the given locale.
* @param index the index within the available list of currency codes
* for the given locale on the given date.
* @param buff fill-in buffer. Can be NULL for preflighting.
* @param buffCapacity capacity of the fill-in buffer. Can be 0 for
* preflighting. If it is non-zero, the buff parameter
* must not be NULL.
* @param ec error code
* @return length of the currency string. It should always be 3.
* If 0, the currency couldn't be found or the input values are
* invalid.
* @stable ICU 4.0
*/
U_CAPI int32_t U_EXPORT2
ucurr_forLocaleAndDate(const char* locale,
UDate date,
int32_t index,
UChar* buff,
int32_t buffCapacity,
UErrorCode* ec);
/**
* Given a key and a locale, returns an array of string values in a preferred
* order that would make a difference. These are all and only those values where
* the open (creation) of the service with the locale formed from the input locale
* plus input keyword and that value has different behavior than creation with the
* input locale alone.
* @param key one of the keys supported by this service. For now, only
* "currency" is supported.
* @param locale the locale
* @param commonlyUsed if set to true, it will return only commonly used values
* with the given locale in preferred order. Otherwise,
* it will return all the available values for the locale.
* @param status error status
* @return a string enumeration over keyword values for the given key and the locale.
* @stable ICU 4.2
*/
U_CAPI UEnumeration* U_EXPORT2
ucurr_getKeywordValuesForLocale(const char* key,
const char* locale,
UBool commonlyUsed,
UErrorCode* status);
/**
* Returns the ISO 4217 numeric code for the currency.
* <p>Note: If the ISO 4217 numeric code is not assigned for the currency or
* the currency is unknown, this function returns 0.</p>
*
* @param currency null-terminated 3-letter ISO 4217 code
* @return The ISO 4217 numeric code of the currency
* @stable ICU 49
*/
U_CAPI int32_t U_EXPORT2
ucurr_getNumericCode(const UChar* currency);
#endif /* #if !UCONFIG_NO_FORMATTING */
#endif

440
thirdparty/icu4c/common/unicode/udata.h vendored Normal file
View File

@@ -0,0 +1,440 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1999-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: udata.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999oct25
* created by: Markus W. Scherer
*/
#ifndef __UDATA_H__
#define __UDATA_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
U_CDECL_BEGIN
/**
* \file
* \brief C API: Data loading interface
*
* <h2>Information about data loading interface</h2>
*
* This API is used to find and efficiently load data for ICU and applications
* using ICU. It provides an abstract interface that specifies a data type and
* name to find and load the data. Normally this API is used by other ICU APIs
* to load required data out of the ICU data library, but it can be used to
* load data out of other places.
*
* See the User Guide Data Management chapter.
*/
#ifndef U_HIDE_INTERNAL_API
/**
* Character used to separate package names from tree names.
* Character form of U_TREE_SEPARATOR_STRING.
* @internal ICU 3.0
*/
#define U_TREE_SEPARATOR '-'
/**
* String used to separate package names from tree names.
* String form of U_TREE_SEPARATOR.
* @internal ICU 3.0
*/
#define U_TREE_SEPARATOR_STRING "-"
/**
* Character used to separate parts of entry names.
* Character form of U_TREE_ENTRY_SEP_STRING.
* @internal ICU 3.0
*/
#define U_TREE_ENTRY_SEP_CHAR '/'
/**
* String used to separate parts of entry names.
* String form of U_TREE_ENTRY_SEP_CHAR.
* @internal ICU 3.0
*/
#define U_TREE_ENTRY_SEP_STRING "/"
/**
* Alias for standard ICU data.
* @internal ICU 3.0
*/
#define U_ICUDATA_ALIAS "ICUDATA"
#endif /* U_HIDE_INTERNAL_API */
/**
* UDataInfo contains the properties about the requested data.
* This is meta data.
*
* <p>This structure may grow in the future, indicated by the
* <code>size</code> field.</p>
*
* <p>ICU data must be at least 8-aligned, and should be 16-aligned.
* The UDataInfo struct begins 4 bytes after the start of the data item,
* so it is 4-aligned.</p>
*
* <p>The platform data property fields help determine if a data
* file can be efficiently used on a given machine.
* The particular fields are of importance only if the data
* is affected by the properties - if there is integer data
* with word sizes > 1 byte, char* text, or UChar* text.</p>
*
* <p>The implementation for the <code>udata_open[Choice]()</code>
* functions may reject data based on the value in <code>isBigEndian</code>.
* No other field is used by the <code>udata</code> API implementation.</p>
*
* <p>The <code>dataFormat</code> may be used to identify
* the kind of data, e.g. a converter table.</p>
*
* <p>The <code>formatVersion</code> field should be used to
* make sure that the format can be interpreted.
* It may be a good idea to check only for the one or two highest
* of the version elements to allow the data memory to
* get more or somewhat rearranged contents, for as long
* as the using code can still interpret the older contents.</p>
*
* <p>The <code>dataVersion</code> field is intended to be a
* common place to store the source version of the data;
* for data from the Unicode character database, this could
* reflect the Unicode version.</p>
*
* @stable ICU 2.0
*/
typedef struct {
/** sizeof(UDataInfo)
* @stable ICU 2.0 */
uint16_t size;
/** unused, set to 0
* @stable ICU 2.0 */
uint16_t reservedWord;
/* platform data properties */
/** 0 for little-endian machine, 1 for big-endian
* @stable ICU 2.0 */
uint8_t isBigEndian;
/** see U_CHARSET_FAMILY values in utypes.h
* @stable ICU 2.0 */
uint8_t charsetFamily;
/** sizeof(UChar), one of { 1, 2, 4 }
* @stable ICU 2.0 */
uint8_t sizeofUChar;
/** unused, set to 0
* @stable ICU 2.0 */
uint8_t reservedByte;
/** data format identifier
* @stable ICU 2.0 */
uint8_t dataFormat[4];
/** versions: [0] major [1] minor [2] milli [3] micro
* @stable ICU 2.0 */
uint8_t formatVersion[4];
/** versions: [0] major [1] minor [2] milli [3] micro
* @stable ICU 2.0 */
uint8_t dataVersion[4];
} UDataInfo;
/* API for reading data -----------------------------------------------------*/
/**
* Forward declaration of the data memory type.
* The struct is only declared here, not defined; the udata_ functions in this
* header use pointers to it as handles.
* @stable ICU 2.0
*/
typedef struct UDataMemory UDataMemory;
/**
* Callback function type for udata_openChoice().
* @param context parameter passed into <code>udata_openChoice()</code>.
* @param type The type of the data as passed into <code>udata_openChoice()</code>.
* It may be <code>NULL</code>.
* @param name The name of the data as passed into <code>udata_openChoice()</code>.
* @param pInfo A pointer to the <code>UDataInfo</code> structure
* of data that has been loaded and will be returned
* by <code>udata_openChoice()</code> if this function
* returns <code>true</code>.
* @return true if the current data memory is acceptable
* @stable ICU 2.0
*/
typedef UBool U_CALLCONV
UDataMemoryIsAcceptable(void *context,
const char *type, const char *name,
const UDataInfo *pInfo);
/**
* Convenience function.
* This function works the same as <code>udata_openChoice()</code>
* except that any data that matches the type and name
* is assumed to be acceptable.
* @param path Specifies an absolute path and/or a basename for
* finding the data in the file system.
* <code>NULL</code> for ICU data.
* @param type A string that specifies the type of data to be loaded.
* For example, resource bundles are loaded with type "res",
* conversion tables with type "cnv".
* This may be <code>NULL</code> or empty.
* @param name A string that specifies the name of the data.
* @param pErrorCode An ICU UErrorCode parameter. It must not be <code>NULL</code>.
* @return A pointer (handle) to a data memory object, or <code>NULL</code>
* if an error occurs. Call <code>udata_getMemory()</code>
* to get a pointer to the actual data.
*
* @see udata_openChoice
* @stable ICU 2.0
*/
U_CAPI UDataMemory * U_EXPORT2
udata_open(const char *path, const char *type, const char *name,
UErrorCode *pErrorCode);
/**
* Data loading function.
* This function is used to efficiently find and load data for
* ICU and applications using ICU.
* It provides an abstract interface that allows specifying a data
* type and name to find and load the data.
*
* <p>The implementation depends on platform properties and user preferences
* and may involve loading shared libraries (DLLs), mapping
* files into memory, or fopen()/fread() files.
* It may also involve using static memory or database queries etc.
* Several or all data items may be combined into one entity
* (DLL, memory-mappable file).</p>
*
* <p>The data is always preceded by a header that includes
* a <code>UDataInfo</code> structure.
* The caller's <code>isAcceptable()</code> function is called to make
* sure that the data is useful. It may be called several times if it
* rejects the data and there is more than one location with data
* matching the type and name.</p>
*
* <p>If <code>path==NULL</code>, then ICU data is loaded.
* Otherwise, it is separated into a basename and a basename-less directory string.
* The basename is used as the data package name, and the directory is
* logically prepended to the ICU data directory string.</p>
*
* <p>For details about ICU data loading see the User Guide
* Data Management chapter. (https://unicode-org.github.io/icu/userguide/icu_data/)</p>
*
* @param path Specifies an absolute path and/or a basename for
* finding the data in the file system.
* <code>NULL</code> for ICU data.
* @param type A string that specifies the type of data to be loaded.
* For example, resource bundles are loaded with type "res",
* conversion tables with type "cnv".
* This may be <code>NULL</code> or empty.
* @param name A string that specifies the name of the data.
* @param isAcceptable This function is called to verify that loaded data
* is useful for the client code. If it returns false
* for all data items, then <code>udata_openChoice()</code>
* will return with an error.
* @param context Arbitrary parameter to be passed into isAcceptable.
* @param pErrorCode An ICU UErrorCode parameter. It must not be <code>NULL</code>.
* @return A pointer (handle) to a data memory object, or <code>NULL</code>
* if an error occurs. Call <code>udata_getMemory()</code>
* to get a pointer to the actual data.
* @stable ICU 2.0
*/
U_CAPI UDataMemory * U_EXPORT2
udata_openChoice(const char *path, const char *type, const char *name,
UDataMemoryIsAcceptable *isAcceptable, void *context,
UErrorCode *pErrorCode);
/**
* Closes the data memory.
* This function must be called to allow the system to
* release resources associated with this data memory.
* @param pData The pointer to the data memory object
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
udata_close(UDataMemory *pData);
/**
* Gets the pointer to the actual data inside the data memory.
* The data is read-only.
*
* ICU data must be at least 8-aligned, and should be 16-aligned.
*
* @param pData The pointer to the data memory object
* @return pointer to the actual (read-only) data inside the data memory
* @stable ICU 2.0
*/
U_CAPI const void * U_EXPORT2
udata_getMemory(UDataMemory *pData);
/**
* Gets the information from the data memory header.
* This gives access to the header containing
* platform data properties etc., which is not part of
* the data itself and therefore cannot be accessed
* via the pointer that <code>udata_getMemory()</code> returns.
*
* @param pData pointer to the data memory object
* @param pInfo pointer to a UDataInfo object;
* its <code>size</code> field must be set correctly,
* typically to <code>sizeof(UDataInfo)</code>.
*
* <code>*pInfo</code> will be filled with the UDataInfo structure
* in the data memory object. If this structure is smaller than
* <code>pInfo->size</code>, then the <code>size</code> will be
* adjusted and only part of the structure will be filled.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
udata_getInfo(UDataMemory *pData, UDataInfo *pInfo);
/**
* This function bypasses the normal ICU data loading process and
* allows you to force ICU's system data to come out of a user-specified
* area in memory.
*
* ICU data must be at least 8-aligned, and should be 16-aligned.
* See https://unicode-org.github.io/icu/userguide/icu_data
*
* The format of this data is that of the ICU common data file, as is
* generated by the pkgdata tool with mode=common or mode=dll.
* You can read in a whole common mode file and pass the address of the start of the
* data, or (with the appropriate link options) pass in the pointer to
* the data that has been loaded from a DLL by the operating system,
* as shown in this code:
*
* extern const char U_IMPORT U_ICUDATA_ENTRY_POINT [];
* // U_ICUDATA_ENTRY_POINT is the same as the entry point specified to the pkgdata tool
* UErrorCode status = U_ZERO_ERROR;
*
* udata_setCommonData(&U_ICUDATA_ENTRY_POINT, &status);
*
* It is important that the declaration be as above. The entry point
* must not be declared as an extern void*.
*
* Starting with ICU 4.4, it is possible to set several data packages,
* one per call to this function.
* udata_open() will look for data in the multiple data packages in the order
* in which they were set.
* The position of the linked-in or default-name ICU .data package in the
* search list depends on when the first data item is loaded that is not contained
* in the already explicitly set packages.
* If data was loaded implicitly before the first call to this function
* (for example, via opening a converter, constructing a UnicodeString
* from default-codepage data, using formatting or collation APIs, etc.),
* then the default data will be first in the list.
*
* This function has no effect on application (non ICU) data. See udata_setAppData()
* for similar functionality for application data.
*
* @param data pointer to ICU common data
* @param err outgoing error status <code>U_USING_DEFAULT_WARNING, U_UNSUPPORTED_ERROR</code>
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
udata_setCommonData(const void *data, UErrorCode *err);
/**
* This function bypasses the normal ICU data loading process for application-specific
* data and allows you to force it to come out of a user-specified
* pointer.
*
* ICU data must be at least 8-aligned, and should be 16-aligned.
* See https://unicode-org.github.io/icu/userguide/icu_data
*
* The format of this data is that of the ICU common data file, like 'icudt26l.dat'
* or the corresponding shared library (DLL) file.
* The application must read in or otherwise construct an image of the data and then
* pass the address of it to this function.
*
*
* Warning: setAppData will set a U_USING_DEFAULT_WARNING code if
* data with the specified path has already been opened, or
* if setAppData with the same path has already been called.
* Any such calls to setAppData will have no effect.
*
*
* @param packageName the package name by which the application will refer
* to (open) this data
* @param data pointer to the data
* @param err outgoing error status <code>U_USING_DEFAULT_WARNING, U_UNSUPPORTED_ERROR</code>
* @see udata_setCommonData
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
udata_setAppData(const char *packageName, const void *data, UErrorCode *err);
/**
* Possible settings for udata_setFileAccess().
* @see udata_setFileAccess
* @stable ICU 3.4
*/
typedef enum UDataFileAccess {
/** ICU looks for data in single files first, then in packages. (default) @stable ICU 3.4 */
UDATA_FILES_FIRST,
/** An alias for the default access mode (UDATA_FILES_FIRST). @stable ICU 3.4 */
UDATA_DEFAULT_ACCESS = UDATA_FILES_FIRST,
/** ICU only loads data from packages, not from single files. @stable ICU 3.4 */
UDATA_ONLY_PACKAGES,
/** ICU loads data from packages first, and only from single files
if the data cannot be found in a package. @stable ICU 3.4 */
UDATA_PACKAGES_FIRST,
/** ICU does not access the file system for data loading. @stable ICU 3.4 */
UDATA_NO_FILES,
#ifndef U_HIDE_DEPRECATED_API
/**
* Number of real UDataFileAccess values.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UDATA_FILE_ACCESS_COUNT
#endif // U_HIDE_DEPRECATED_API
} UDataFileAccess;
/**
* This function may be called to control how ICU loads data. It must be called
* before any ICU data is loaded, including application data loaded with
* ures/ResourceBundle or udata APIs. This function is not multithread-safe.
* The results of calling it while other threads are loading data are undefined.
* @param access The type of file access to be used
* @param status Error code.
* @see UDataFileAccess
* @stable ICU 3.4
*/
U_CAPI void U_EXPORT2
udata_setFileAccess(UDataFileAccess access, UErrorCode *status);
U_CDECL_END
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUDataMemoryPointer
* "Smart pointer" class, closes a UDataMemory via udata_close().
* The class is generated by the U_DEFINE_LOCAL_OPEN_POINTER macro.
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUDataMemoryPointer, UDataMemory, udata_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
#endif

View File

@@ -0,0 +1,173 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*****************************************************************************************
* Copyright (C) 2014-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*****************************************************************************************
*/
#ifndef UDISPLAYCONTEXT_H
#define UDISPLAYCONTEXT_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING
/**
* \file
* \brief C API: Display context types (enum values)
*/
/**
* Display context types, for getting values of a particular setting.
* Note, the specific numeric values are internal and may change.
* @stable ICU 51
*/
enum UDisplayContextType {
/**
* Type to retrieve the dialect handling setting, e.g.
* UDISPCTX_STANDARD_NAMES or UDISPCTX_DIALECT_NAMES.
* @stable ICU 51
*/
UDISPCTX_TYPE_DIALECT_HANDLING = 0,
/**
* Type to retrieve the capitalization context setting, e.g.
* UDISPCTX_CAPITALIZATION_NONE, UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE,
* UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE, etc.
* @stable ICU 51
*/
UDISPCTX_TYPE_CAPITALIZATION = 1,
/**
* Type to retrieve the display length setting, e.g.
* UDISPCTX_LENGTH_FULL, UDISPCTX_LENGTH_SHORT.
* @stable ICU 54
*/
UDISPCTX_TYPE_DISPLAY_LENGTH = 2,
/**
* Type to retrieve the substitute handling setting, e.g.
* UDISPCTX_SUBSTITUTE, UDISPCTX_NO_SUBSTITUTE.
* @stable ICU 58
*/
UDISPCTX_TYPE_SUBSTITUTE_HANDLING = 3
};
/**
* Typedef of enum UDisplayContextType, so the type can be named without the enum keyword.
* @stable ICU 51
*/
typedef enum UDisplayContextType UDisplayContextType;
/**
 * Display context settings: the individual values available within each
 * UDisplayContextType group.
 * Every value is written as (type << 8) + index, where type is the
 * UDisplayContextType the value belongs to.
 * Note, the specific numeric values are internal and may change.
 * @stable ICU 51
 */
typedef enum UDisplayContext {
    /**
     * ================================
     * DIALECT_HANDLING can be set to one of UDISPCTX_STANDARD_NAMES or
     * UDISPCTX_DIALECT_NAMES. Use UDisplayContextType
     * UDISPCTX_TYPE_DIALECT_HANDLING to get the value.
     */
    /**
     * DIALECT_HANDLING setting: generate locale names from standard names,
     * e.g. en_GB displays as 'English (United Kingdom)'.
     * @stable ICU 51
     */
    UDISPCTX_STANDARD_NAMES = (UDISPCTX_TYPE_DIALECT_HANDLING << 8) + 0,

    /**
     * DIALECT_HANDLING setting: generate locale names from dialect names,
     * e.g. en_GB displays as 'British English'.
     * @stable ICU 51
     */
    UDISPCTX_DIALECT_NAMES = (UDISPCTX_TYPE_DIALECT_HANDLING << 8) + 1,

    /**
     * ================================
     * CAPITALIZATION can be set to one of UDISPCTX_CAPITALIZATION_NONE,
     * UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE,
     * UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE,
     * UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU, or
     * UDISPCTX_CAPITALIZATION_FOR_STANDALONE.
     * Use UDisplayContextType UDISPCTX_TYPE_CAPITALIZATION to get the value.
     */
    /**
     * CAPITALIZATION setting: the capitalization context is unknown
     * (this is the default value).
     * @stable ICU 51
     */
    UDISPCTX_CAPITALIZATION_NONE = (UDISPCTX_TYPE_CAPITALIZATION << 8) + 0,

    /**
     * CAPITALIZATION setting: a date, date symbol or display name is to be
     * formatted with capitalization appropriate for the middle of a sentence.
     * @stable ICU 51
     */
    UDISPCTX_CAPITALIZATION_FOR_MIDDLE_OF_SENTENCE = (UDISPCTX_TYPE_CAPITALIZATION << 8) + 1,

    /**
     * CAPITALIZATION setting: a date, date symbol or display name is to be
     * formatted with capitalization appropriate for the beginning of a sentence.
     * @stable ICU 51
     */
    UDISPCTX_CAPITALIZATION_FOR_BEGINNING_OF_SENTENCE = (UDISPCTX_TYPE_CAPITALIZATION << 8) + 2,

    /**
     * CAPITALIZATION setting: a date, date symbol or display name is to be
     * formatted with capitalization appropriate for a user-interface list
     * or menu item.
     * @stable ICU 51
     */
    UDISPCTX_CAPITALIZATION_FOR_UI_LIST_OR_MENU = (UDISPCTX_TYPE_CAPITALIZATION << 8) + 3,

    /**
     * CAPITALIZATION setting: a date, date symbol or display name is to be
     * formatted with capitalization appropriate for stand-alone usage such as
     * an isolated name on a calendar page.
     * @stable ICU 51
     */
    UDISPCTX_CAPITALIZATION_FOR_STANDALONE = (UDISPCTX_TYPE_CAPITALIZATION << 8) + 4,

    /**
     * ================================
     * DISPLAY_LENGTH can be set to one of UDISPCTX_LENGTH_FULL or
     * UDISPCTX_LENGTH_SHORT. Use UDisplayContextType
     * UDISPCTX_TYPE_DISPLAY_LENGTH to get the value.
     */
    /**
     * DISPLAY_LENGTH setting: generate locale names from full names,
     * e.g. "United States" for US.
     * @stable ICU 54
     */
    UDISPCTX_LENGTH_FULL = (UDISPCTX_TYPE_DISPLAY_LENGTH << 8) + 0,

    /**
     * DISPLAY_LENGTH setting: generate locale names from short names,
     * e.g. "U.S." for US.
     * @stable ICU 54
     */
    UDISPCTX_LENGTH_SHORT = (UDISPCTX_TYPE_DISPLAY_LENGTH << 8) + 1,

    /**
     * ================================
     * SUBSTITUTE_HANDLING can be set to one of UDISPCTX_SUBSTITUTE or
     * UDISPCTX_NO_SUBSTITUTE. Use UDisplayContextType
     * UDISPCTX_TYPE_SUBSTITUTE_HANDLING to get the value.
     */
    /**
     * SUBSTITUTE_HANDLING setting: a fallback value (e.g., the input code)
     * is returned when no data is available.
     * This is the default value.
     * @stable ICU 58
     */
    UDISPCTX_SUBSTITUTE = (UDISPCTX_TYPE_SUBSTITUTE_HANDLING << 8) + 0,

    /**
     * SUBSTITUTE_HANDLING setting: a null value is returned, with the error
     * code set to U_ILLEGAL_ARGUMENT_ERROR, when no data is available.
     * @stable ICU 58
     */
    UDISPCTX_NO_SUBSTITUTE = (UDISPCTX_TYPE_SUBSTITUTE_HANDLING << 8) + 1
} UDisplayContext;
#endif /* #if !UCONFIG_NO_FORMATTING */
#endif

209
thirdparty/icu4c/common/unicode/uenum.h vendored Normal file
View File

@@ -0,0 +1,209 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2002-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uenum.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:2
*
* created on: 2002jul08
* created by: Vladimir Weinstein
*/
#ifndef __UENUM_H
#define __UENUM_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
// C++-only prerequisites for the declarations in this header.
// NOTE(review): localpointer.h presumably supplies U_DEFINE_LOCAL_OPEN_POINTER,
// which this header uses to define LocalUEnumerationPointer - confirm.
#include "unicode/localpointer.h"
U_NAMESPACE_BEGIN
// Forward declaration only: this header refers to the class solely through
// the icu::StringEnumeration* parameter of uenum_openFromStringEnumeration().
class StringEnumeration;
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C API: String Enumeration
*/
/**
 * An enumeration object.
 * For usage in C programs.
 * The struct is only declared here, not defined, so code that includes this
 * header works with it through UEnumeration* pointers.
 * @stable ICU 2.2
 */
struct UEnumeration;
/** structure representing an enumeration object instance @stable ICU 2.2 */
typedef struct UEnumeration UEnumeration;
/**
 * Disposes of resources in use by the iterator.
 * After this call, any char* or UChar* pointer returned by
 * uenum_unext() or uenum_next() is invalid.
 * @param en UEnumeration structure pointer; if it is NULL,
 *           the function does nothing
 * @stable ICU 2.2
 */
U_CAPI void U_EXPORT2
uenum_close(UEnumeration* en);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
 * \class LocalUEnumerationPointer
 * "Smart pointer" class, closes a UEnumeration via uenum_close().
 * For most methods see the LocalPointerBase base class.
 * Only declared when U_SHOW_CPLUSPLUS_API is enabled.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUEnumerationPointer, UEnumeration, uenum_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
 * Returns the number of elements that the iterator traverses.
 * If the iterator is out-of-sync with its service, status is set to
 * U_ENUM_OUT_OF_SYNC_ERROR.
 *
 * This is a convenience function. It can end up being very expensive,
 * as all the items might have to be pre-fetched (depending on the type
 * of data being traversed). Use with caution and only when necessary.
 *
 * @param en UEnumeration structure pointer
 * @param status error code, can be U_ENUM_OUT_OF_SYNC_ERROR if the
 *               iterator is out of sync.
 * @return number of elements in the iterator
 * @stable ICU 2.2
 */
U_CAPI int32_t U_EXPORT2
uenum_count(UEnumeration* en, UErrorCode* status);
/**
 * Returns the next element in the iterator's list as a UChar* string.
 * If there are no more elements, returns NULL. If the iterator is
 * out-of-sync with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR
 * and NULL is returned.
 *
 * If the native service string is a char* string, it is converted to
 * UChar* with the invariant converter.
 * The result is terminated by (UChar)0.
 *
 * @param en the iterator object
 * @param resultLength pointer to receive the length of the result
 *                     (not including the terminating \\0).
 *                     If the pointer is NULL it is ignored.
 * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
 *               the iterator is out of sync with its service.
 * @return a pointer to the string. The string will be
 *         zero-terminated. The return pointer is owned by this iterator
 *         and must not be deleted by the caller. The pointer is valid
 *         until the next call to any uenum_... method, including
 *         uenum_next() or uenum_unext(). When all strings have been
 *         traversed, returns NULL.
 * @see uenum_next
 * @stable ICU 2.2
 */
U_CAPI const UChar* U_EXPORT2
uenum_unext(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/**
 * Returns the next element in the iterator's list as a char* string.
 * If there are no more elements, returns NULL. If the iterator is
 * out-of-sync with its service, status is set to U_ENUM_OUT_OF_SYNC_ERROR
 * and NULL is returned.
 *
 * If the native service string is a UChar* string, it is converted to
 * char* with the invariant converter.
 * The result is terminated by (char)0. If the conversion fails
 * (because a character cannot be converted) then status is set to
 * U_INVARIANT_CONVERSION_ERROR and the return value is undefined
 * (but non-NULL).
 *
 * @param en the iterator object
 * @param resultLength pointer to receive the length of the result
 *                     (not including the terminating \\0).
 *                     If the pointer is NULL it is ignored.
 * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
 *               the iterator is out of sync with its service. Set to
 *               U_INVARIANT_CONVERSION_ERROR if the underlying native string is
 *               UChar* and conversion to char* with the invariant converter
 *               fails. This error pertains only to current string, so iteration
 *               might be able to continue successfully.
 * @return a pointer to the string. The string will be
 *         zero-terminated. The return pointer is owned by this iterator
 *         and must not be deleted by the caller. The pointer is valid
 *         until the next call to any uenum_... method, including
 *         uenum_next() or uenum_unext(). When all strings have been
 *         traversed, returns NULL.
 * @see uenum_unext
 * @stable ICU 2.2
 */
U_CAPI const char* U_EXPORT2
uenum_next(UEnumeration* en,
int32_t* resultLength,
UErrorCode* status);
/**
 * Resets the iterator to the current list of service IDs.
 * This re-establishes sync with the service and rewinds the iterator
 * to start at the first element.
 * @param en the iterator object
 * @param status the error code, set to U_ENUM_OUT_OF_SYNC_ERROR if
 *               the iterator is out of sync with its service.
 * @stable ICU 2.2
 */
U_CAPI void U_EXPORT2
uenum_reset(UEnumeration* en, UErrorCode* status);
#if U_SHOW_CPLUSPLUS_API
/**
 * Given a StringEnumeration, wrap it in a UEnumeration.
 * The StringEnumeration is adopted; after this call, the caller must not
 * delete it (regardless of error status).
 * Only declared when U_SHOW_CPLUSPLUS_API is enabled, because the
 * parameter is a C++ class pointer.
 * @param adopted the C++ StringEnumeration to be wrapped in a UEnumeration.
 * @param ec the error code.
 * @return a UEnumeration wrapping the adopted StringEnumeration.
 * @stable ICU 4.2
 */
U_CAPI UEnumeration* U_EXPORT2
uenum_openFromStringEnumeration(icu::StringEnumeration* adopted, UErrorCode* ec);
#endif // U_SHOW_CPLUSPLUS_API
/**
 * Given an array of const UChar* strings, return a UEnumeration.
 * String pointers from 0..count-1 must not be null.
 * Do not free or modify either the string array or the characters it points to
 * until this object has been destroyed with uenum_close.
 * \snippet test/cintltst/uenumtst.c uenum_openUCharStringsEnumeration
 * @param strings array of const UChar* strings (each null terminated).
 *                All storage is owned by the caller.
 * @param count length of the array
 * @param ec error code
 * @return the new UEnumeration object. Caller is responsible for calling
 *         uenum_close to free memory.
 * @see uenum_close
 * @stable ICU 50
 */
U_CAPI UEnumeration* U_EXPORT2
uenum_openUCharStringsEnumeration(const UChar* const strings[], int32_t count,
UErrorCode* ec);
/**
 * Given an array of const char* strings (invariant chars only), return a UEnumeration.
 * String pointers from 0..count-1 must not be null.
 * Do not free or modify either the string array or the characters it points to
 * until this object has been destroyed with uenum_close.
 * \snippet test/cintltst/uenumtst.c uenum_openCharStringsEnumeration
 * @param strings array of char* strings (each null terminated).
 *                All storage is owned by the caller.
 * @param count length of the array
 * @param ec error code
 * @return the new UEnumeration object. Caller is responsible for calling
 *         uenum_close to free memory.
 * @see uenum_close
 * @stable ICU 50
 */
U_CAPI UEnumeration* U_EXPORT2
uenum_openCharStringsEnumeration(const char* const strings[], int32_t count,
UErrorCode* ec);
#endif

794
thirdparty/icu4c/common/unicode/uidna.h vendored Normal file
View File

@@ -0,0 +1,794 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2003-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uidna.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003feb1
* created by: Ram Viswanadha
*/
#ifndef __UIDNA_H__
#define __UIDNA_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_IDNA
#include <stdbool.h>
#include "unicode/parseerr.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C API: Internationalizing Domain Names in Applications (IDNA)
*
* IDNA2008 is implemented according to UTS #46, see the IDNA C++ class in idna.h.
*
* The C API functions which do take a UIDNA * service object pointer
* implement UTS #46 and IDNA2008.
*
* IDNA2003 is obsolete.
* The C API functions which do not take a service object pointer
* implement IDNA2003. They are all deprecated.
*/
/*
 * IDNA option bit set values; individual options are combined with bitwise OR.
 */
enum {
    /**
     * Default options value: UTS #46 nontransitional processing.
     * For use in static worker and factory methods.
     *
     * From ICU 76 on, this value equals
     * UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE,
     * corresponding to Unicode 15.1 UTS #46 deprecating transitional processing.
     * (The IDNA2003 implementation ignores these options.)
     *
     * Before ICU 76, this constant did not set any of the options.
     *
     * @stable ICU 2.6
     */
    UIDNA_DEFAULT = 0x30,
#ifndef U_HIDE_DEPRECATED_API
    /**
     * Allows unassigned code points in domain names and labels.
     * For use in static worker and factory methods.
     * <p>The UTS46 implementation ignores this option.
     * (UTS #46 disallows unassigned code points.)
     * @deprecated ICU 55 Use UTS #46 instead via uidna_openUTS46() or class IDNA.
     */
    UIDNA_ALLOW_UNASSIGNED = 0x01,
#endif /* U_HIDE_DEPRECATED_API */
    /**
     * Checks whether the input conforms to the STD3 ASCII rules,
     * for example the restriction of labels to LDH characters
     * (ASCII Letters, Digits and Hyphen-Minus).
     * For use in static worker and factory methods.
     * @stable ICU 2.6
     */
    UIDNA_USE_STD3_RULES = 0x02,
    /**
     * Checks whether the input conforms to the BiDi rules.
     * For use in static worker and factory methods.
     * <p>The IDNA2003 implementation ignores this option.
     * (IDNA2003 always performs a BiDi check.)
     * @stable ICU 4.6
     */
    UIDNA_CHECK_BIDI = 0x04,
    /**
     * Checks whether the input conforms to the CONTEXTJ rules.
     * For use in static worker and factory methods.
     * <p>The IDNA2003 implementation ignores this option.
     * (The CONTEXTJ check is new in IDNA2008.)
     * @stable ICU 4.6
     */
    UIDNA_CHECK_CONTEXTJ = 0x08,
    /**
     * Requests nontransitional processing in ToASCII().
     * For use in static worker and factory methods.
     *
     * <p>When this bit is not set, ToASCII() uses transitional processing.
     * Unicode 15.1 UTS #46 deprecated transitional processing.
     *
     * <p>The IDNA2003 implementation ignores this option.
     * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
     * @stable ICU 4.6
     * @see UIDNA_DEFAULT
     */
    UIDNA_NONTRANSITIONAL_TO_ASCII = 0x10,
    /**
     * Requests nontransitional processing in ToUnicode().
     * For use in static worker and factory methods.
     *
     * <p>When this bit is not set, ToUnicode() uses transitional processing.
     * Unicode 15.1 UTS #46 deprecated transitional processing.
     *
     * <p>The IDNA2003 implementation ignores this option.
     * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
     * @stable ICU 4.6
     * @see UIDNA_DEFAULT
     */
    UIDNA_NONTRANSITIONAL_TO_UNICODE = 0x20,
    /**
     * Checks whether the input conforms to the CONTEXTO rules.
     * For use in static worker and factory methods.
     * <p>The IDNA2003 implementation ignores this option.
     * (The CONTEXTO check is new in IDNA2008.)
     * <p>This is for use by registries for IDNA2008 conformance.
     * UTS #46 does not require the CONTEXTO check.
     * @stable ICU 49
     */
    UIDNA_CHECK_CONTEXTO = 0x40
};
/**
 * Opaque C service object type for the new IDNA API.
 * The struct is only declared here, not defined, so code that includes this
 * header works with it through UIDNA * pointers.
 * @stable ICU 4.6
 */
struct UIDNA;
typedef struct UIDNA UIDNA; /**< C typedef for struct UIDNA. @stable ICU 4.6 */
/**
 * Returns a UIDNA instance which implements UTS #46.
 * Returns an unmodifiable instance, owned by the caller.
 * Cache it for multiple operations, and uidna_close() it when done.
 * The instance is thread-safe, that is, it can be used concurrently.
 *
 * For details about the UTS #46 implementation see the IDNA C++ class in idna.h.
 *
 * @param options Bit set to modify the processing and error checking.
 *                These should include UIDNA_DEFAULT, or
 *                UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_NONTRANSITIONAL_TO_UNICODE
 *                (UIDNA_DEFAULT is 0x30, which is exactly those two bits).
 *                See option bit set values in uidna.h.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return the UTS #46 UIDNA instance, if successful
 * @see uidna_close
 * @stable ICU 4.6
 */
U_CAPI UIDNA * U_EXPORT2
uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode);
/**
 * Closes a UIDNA instance.
 * This is the counterpart of uidna_openUTS46(), whose result is owned
 * by the caller.
 * @param idna UIDNA instance to be closed
 * @stable ICU 4.6
 */
U_CAPI void U_EXPORT2
uidna_close(UIDNA *idna);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
 * \class LocalUIDNAPointer
 * "Smart pointer" class, closes a UIDNA via uidna_close().
 * For most methods see the LocalPointerBase base class.
 * Only declared when U_SHOW_CPLUSPLUS_API is enabled.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.6
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUIDNAPointer, UIDNA, uidna_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
 * Output container for IDNA processing errors.
 * Initialize with UIDNA_INFO_INITIALIZER:
 * \code
 * UIDNAInfo info = UIDNA_INFO_INITIALIZER;
 * int32_t length = uidna_nameToASCII(..., &info, &errorCode);
 * if(U_SUCCESS(errorCode) && info.errors!=0) { ... }
 * \endcode
 * As the example shows, info.errors is inspected in addition to the
 * UErrorCode: a call can pass U_SUCCESS() and still report IDNA errors here.
 * @stable ICU 4.6
 */
typedef struct UIDNAInfo {
/** sizeof(UIDNAInfo) @stable ICU 4.6 */
int16_t size;
/**
 * Set to true if transitional and nontransitional processing produce different results.
 * For details see C++ IDNAInfo::isTransitionalDifferent().
 * @stable ICU 4.6
 */
UBool isTransitionalDifferent;
UBool reservedB3; /**< Reserved field, do not use. @internal */
/**
 * Bit set indicating IDNA processing errors. 0 if no errors.
 * See UIDNA_ERROR_... constants.
 * @stable ICU 4.6
 */
uint32_t errors;
int32_t reservedI2; /**< Reserved field, do not use. @internal */
int32_t reservedI3; /**< Reserved field, do not use. @internal */
} UIDNAInfo;
/**
 * Static initializer for a UIDNAInfo struct.
 * Supplies one value per member, in declaration order:
 * size = sizeof(UIDNAInfo), isTransitionalDifferent = false,
 * reservedB3 = false, errors = 0, reservedI2 = 0, reservedI3 = 0.
 * @stable ICU 4.6
 */
#define UIDNA_INFO_INITIALIZER { \
(int16_t)sizeof(UIDNAInfo), \
false, false, \
0, 0, 0 }
/**
 * Converts a single domain name label into its ASCII form for DNS lookup.
 * If any processing step fails, then pInfo->errors will be non-zero and
 * the result might not be an ASCII string.
 * The label might be modified according to the types of errors.
 * Labels with severe errors will be left in (or turned into) their Unicode form.
 *
 * The UErrorCode indicates an error only in exceptional cases,
 * such as a U_MEMORY_ALLOCATION_ERROR.
 * IDNA processing problems are reported through pInfo->errors instead
 * (see the UIDNA_ERROR_... constants), so check both after the call.
 *
 * @param idna UIDNA instance
 * @param label Input domain name label
 * @param length Label length, or -1 if NUL-terminated
 * @param dest Destination string buffer
 * @param capacity Destination buffer capacity
 * @param pInfo Output container of IDNA processing details.
 *              Initialize it with UIDNA_INFO_INITIALIZER.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return destination string length
 * @stable ICU 4.6
 */
U_CAPI int32_t U_EXPORT2
uidna_labelToASCII(const UIDNA *idna,
const UChar *label, int32_t length,
UChar *dest, int32_t capacity,
UIDNAInfo *pInfo, UErrorCode *pErrorCode);
/**
 * Converts a single domain name label into its Unicode form for human-readable display.
 * If any processing step fails, then pInfo->errors will be non-zero.
 * The label might be modified according to the types of errors.
 *
 * The UErrorCode indicates an error only in exceptional cases,
 * such as a U_MEMORY_ALLOCATION_ERROR.
 * IDNA processing problems are reported through pInfo->errors instead
 * (see the UIDNA_ERROR_... constants), so check both after the call.
 *
 * @param idna UIDNA instance
 * @param label Input domain name label
 * @param length Label length, or -1 if NUL-terminated
 * @param dest Destination string buffer
 * @param capacity Destination buffer capacity
 * @param pInfo Output container of IDNA processing details.
 *              Initialize it with UIDNA_INFO_INITIALIZER.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return destination string length
 * @stable ICU 4.6
 */
U_CAPI int32_t U_EXPORT2
uidna_labelToUnicode(const UIDNA *idna,
const UChar *label, int32_t length,
UChar *dest, int32_t capacity,
UIDNAInfo *pInfo, UErrorCode *pErrorCode);
/**
 * Converts a whole domain name into its ASCII form for DNS lookup.
 * If any processing step fails, then pInfo->errors will be non-zero and
 * the result might not be an ASCII string.
 * The domain name might be modified according to the types of errors.
 * Labels with severe errors will be left in (or turned into) their Unicode form.
 *
 * The UErrorCode indicates an error only in exceptional cases,
 * such as a U_MEMORY_ALLOCATION_ERROR.
 * IDNA processing problems are reported through pInfo->errors instead
 * (see the UIDNA_ERROR_... constants), so check both after the call.
 *
 * @param idna UIDNA instance
 * @param name Input domain name
 * @param length Domain name length, or -1 if NUL-terminated
 * @param dest Destination string buffer
 * @param capacity Destination buffer capacity
 * @param pInfo Output container of IDNA processing details.
 *              Initialize it with UIDNA_INFO_INITIALIZER.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return destination string length
 * @stable ICU 4.6
 */
U_CAPI int32_t U_EXPORT2
uidna_nameToASCII(const UIDNA *idna,
const UChar *name, int32_t length,
UChar *dest, int32_t capacity,
UIDNAInfo *pInfo, UErrorCode *pErrorCode);
/**
 * Converts a whole domain name into its Unicode form for human-readable display.
 * If any processing step fails, then pInfo->errors will be non-zero.
 * The domain name might be modified according to the types of errors.
 *
 * The UErrorCode indicates an error only in exceptional cases,
 * such as a U_MEMORY_ALLOCATION_ERROR.
 * IDNA processing problems are reported through pInfo->errors instead
 * (see the UIDNA_ERROR_... constants), so check both after the call.
 *
 * @param idna UIDNA instance
 * @param name Input domain name
 * @param length Domain name length, or -1 if NUL-terminated
 * @param dest Destination string buffer
 * @param capacity Destination buffer capacity
 * @param pInfo Output container of IDNA processing details.
 *              Initialize it with UIDNA_INFO_INITIALIZER.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return destination string length
 * @stable ICU 4.6
 */
U_CAPI int32_t U_EXPORT2
uidna_nameToUnicode(const UIDNA *idna,
const UChar *name, int32_t length,
UChar *dest, int32_t capacity,
UIDNAInfo *pInfo, UErrorCode *pErrorCode);
/* UTF-8 versions of the processing methods --------------------------------- */
/**
 * Converts a single domain name label into its ASCII form for DNS lookup.
 * UTF-8 version of uidna_labelToASCII(), same behavior; the input and
 * output strings are passed as char buffers instead of UChar buffers.
 *
 * @param idna UIDNA instance
 * @param label Input domain name label
 * @param length Label length, or -1 if NUL-terminated
 * @param dest Destination string buffer
 * @param capacity Destination buffer capacity
 * @param pInfo Output container of IDNA processing details.
 *              Initialize it with UIDNA_INFO_INITIALIZER.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return destination string length
 * @see uidna_labelToASCII
 * @stable ICU 4.6
 */
U_CAPI int32_t U_EXPORT2
uidna_labelToASCII_UTF8(const UIDNA *idna,
const char *label, int32_t length,
char *dest, int32_t capacity,
UIDNAInfo *pInfo, UErrorCode *pErrorCode);
/**
 * Converts a single domain name label into its Unicode form for human-readable display.
 * UTF-8 version of uidna_labelToUnicode(), same behavior; the input and
 * output strings are passed as char buffers instead of UChar buffers.
 *
 * Note the spelling: unlike uidna_labelToASCII_UTF8(), this function name
 * has no underscore before "UTF8".
 *
 * @param idna UIDNA instance
 * @param label Input domain name label
 * @param length Label length, or -1 if NUL-terminated
 * @param dest Destination string buffer
 * @param capacity Destination buffer capacity
 * @param pInfo Output container of IDNA processing details.
 *              Initialize it with UIDNA_INFO_INITIALIZER.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return destination string length
 * @see uidna_labelToUnicode
 * @stable ICU 4.6
 */
U_CAPI int32_t U_EXPORT2
uidna_labelToUnicodeUTF8(const UIDNA *idna,
const char *label, int32_t length,
char *dest, int32_t capacity,
UIDNAInfo *pInfo, UErrorCode *pErrorCode);
/**
 * Converts a whole domain name into its ASCII form for DNS lookup.
 * UTF-8 version of uidna_nameToASCII(), same behavior; the input and
 * output strings are passed as char buffers instead of UChar buffers.
 *
 * @param idna UIDNA instance
 * @param name Input domain name
 * @param length Domain name length, or -1 if NUL-terminated
 * @param dest Destination string buffer
 * @param capacity Destination buffer capacity
 * @param pInfo Output container of IDNA processing details.
 *              Initialize it with UIDNA_INFO_INITIALIZER.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return destination string length
 * @see uidna_nameToASCII
 * @stable ICU 4.6
 */
U_CAPI int32_t U_EXPORT2
uidna_nameToASCII_UTF8(const UIDNA *idna,
const char *name, int32_t length,
char *dest, int32_t capacity,
UIDNAInfo *pInfo, UErrorCode *pErrorCode);
/**
 * Converts a whole domain name into its Unicode form for human-readable display.
 * UTF-8 version of uidna_nameToUnicode(), same behavior; the input and
 * output strings are passed as char buffers instead of UChar buffers.
 *
 * Note the spelling: unlike uidna_nameToASCII_UTF8(), this function name
 * has no underscore before "UTF8".
 *
 * @param idna UIDNA instance
 * @param name Input domain name
 * @param length Domain name length, or -1 if NUL-terminated
 * @param dest Destination string buffer
 * @param capacity Destination buffer capacity
 * @param pInfo Output container of IDNA processing details.
 *              Initialize it with UIDNA_INFO_INITIALIZER.
 * @param pErrorCode Standard ICU error code. Its input value must
 *                   pass the U_SUCCESS() test, or else the function returns
 *                   immediately. Check for U_FAILURE() on output or use with
 *                   function chaining. (See User Guide for details.)
 * @return destination string length
 * @see uidna_nameToUnicode
 * @stable ICU 4.6
 */
U_CAPI int32_t U_EXPORT2
uidna_nameToUnicodeUTF8(const UIDNA *idna,
const char *name, int32_t length,
char *dest, int32_t capacity,
UIDNAInfo *pInfo, UErrorCode *pErrorCode);
/*
 * IDNA error bit set values.
 * Whenever a domain name or label fails a processing step or does not meet
 * the validity criteria, one or more of these error bits are set.
 * Each constant occupies its own bit, written below as 1 << n.
 */
enum {
    /**
     * A non-final domain name label (or the whole domain name) is empty.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_EMPTY_LABEL = 1 << 0,
    /**
     * A domain name label is longer than 63 bytes.
     * (See STD13/RFC1034 3.1. Name space specifications and terminology.)
     * Checked only in ToASCII operations, and only if the output label is all-ASCII.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_LABEL_TOO_LONG = 1 << 1,
    /**
     * A domain name is longer than 255 bytes in its storage form.
     * (See STD13/RFC1034 3.1. Name space specifications and terminology.)
     * Checked only in ToASCII operations, and only if the output domain name is all-ASCII.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_DOMAIN_NAME_TOO_LONG = 1 << 2,
    /**
     * A label starts with a hyphen-minus ('-').
     * @stable ICU 4.6
     */
    UIDNA_ERROR_LEADING_HYPHEN = 1 << 3,
    /**
     * A label ends with a hyphen-minus ('-').
     * @stable ICU 4.6
     */
    UIDNA_ERROR_TRAILING_HYPHEN = 1 << 4,
    /**
     * A label contains hyphen-minus ('-') in the third and fourth positions.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_HYPHEN_3_4 = 1 << 5,
    /**
     * A label starts with a combining mark.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_LEADING_COMBINING_MARK = 1 << 6,
    /**
     * A label or domain name contains disallowed characters.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_DISALLOWED = 1 << 7,
    /**
     * A label starts with "xn--" but does not contain valid Punycode.
     * That is, an xn-- label failed Punycode decoding.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_PUNYCODE = 1 << 8,
    /**
     * A label contains a dot=full stop.
     * This can occur in an input string for a single-label function.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_LABEL_HAS_DOT = 1 << 9,
    /**
     * An ACE label does not contain a valid label string.
     * The label was successfully ACE (Punycode) decoded but the resulting
     * string had severe validation errors. For example,
     * it might contain characters that are not allowed in ACE labels,
     * or it might not be normalized.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_INVALID_ACE_LABEL = 1 << 10,
    /**
     * A label does not meet the IDNA BiDi requirements (for right-to-left characters).
     * @stable ICU 4.6
     */
    UIDNA_ERROR_BIDI = 1 << 11,
    /**
     * A label does not meet the IDNA CONTEXTJ requirements.
     * @stable ICU 4.6
     */
    UIDNA_ERROR_CONTEXTJ = 1 << 12,
    /**
     * A label does not meet the IDNA CONTEXTO requirements for punctuation characters.
     * Some punctuation characters "Would otherwise have been DISALLOWED"
     * but are allowed in certain contexts. (RFC 5892)
     * @stable ICU 49
     */
    UIDNA_ERROR_CONTEXTO_PUNCTUATION = 1 << 13,
    /**
     * A label does not meet the IDNA CONTEXTO requirements for digits.
     * Arabic-Indic Digits (U+066x) must not be mixed with Extended Arabic-Indic Digits (U+06Fx).
     * @stable ICU 49
     */
    UIDNA_ERROR_CONTEXTO_DIGITS = 1 << 14
};
#ifndef U_HIDE_DEPRECATED_API
/* IDNA2003 API ------------------------------------------------------------- */
/**
* IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC.
* This operation is done on <b>single labels</b> before sending it to something that expects
* ASCII names. A label is an individual part of a domain name. Labels are usually
* separated by dots; e.g. "www.example.com" is composed of 3 labels "www","example", and "com".
*
* IDNA2003 API Overview:
*
* The uidna_ API implements the IDNA protocol as defined in the IDNA RFC
* (http://www.ietf.org/rfc/rfc3490.txt).
* The RFC defines 2 operations: ToASCII and ToUnicode. Domain name labels
* containing non-ASCII code points are processed by the
* ToASCII operation before passing it to resolver libraries. Domain names
* that are obtained from resolver libraries are processed by the
* ToUnicode operation before displaying the domain name to the user.
* IDNA requires that implementations process input strings with Nameprep
* (http://www.ietf.org/rfc/rfc3491.txt),
* which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt),
* and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt).
* Implementations of IDNA MUST fully implement Nameprep and Punycode;
* neither Nameprep nor Punycode are optional.
* The input and output of ToASCII and ToUnicode operations are Unicode
* and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
* multiple times to an input string will yield the same result as applying the operation
* once.
* ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
* ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
*
* @param src Input UChar array containing label in Unicode.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array with ASCII (ACE encoded) label.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
* U_IDNA_UNASSIGNED_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
* @deprecated ICU 55 Use UTS #46 instead via uidna_openUTS46() or class IDNA.
*/
U_DEPRECATED int32_t U_EXPORT2
uidna_toASCII(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* IDNA2003: This function implements the ToUnicode operation as defined in the IDNA RFC.
* This operation is done on <b>single labels</b> before sending it to something that expects
* Unicode names. A label is an individual part of a domain name. Labels are usually
* separated by dots; e.g. "www.example.com" is composed of 3 labels "www","example", and "com".
*
* @param src Input UChar array containing ASCII (ACE encoded) label.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output Converted UChar array containing Unicode equivalent of label.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
* U_IDNA_UNASSIGNED_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points. <b> Note: </b> This option is
* required on toUnicode operation because the RFC mandates
* verification of decoded ACE input by applying toASCII and comparing
* its output with source
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
* @deprecated ICU 55 Use UTS #46 instead via uidna_openUTS46() or class IDNA.
*/
U_DEPRECATED int32_t U_EXPORT2
uidna_toUnicode(const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
* This operation is done on complete domain names, e.g: "www.example.com".
* It is important to note that this operation can fail. If it fails, then the input
* domain name cannot be used as an Internationalized Domain Name and the application
* should have methods defined to deal with the failure.
*
* <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options once
* set will apply to all labels in the domain name
*
* @param src Input UChar array containing IDN in Unicode.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array with ASCII (ACE encoded) IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
* U_IDNA_UNASSIGNED_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
* @deprecated ICU 55 Use UTS #46 instead via uidna_openUTS46() or class IDNA.
*/
U_DEPRECATED int32_t U_EXPORT2
uidna_IDNToASCII( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
* This operation is done on complete domain names, e.g: "www.example.com".
*
* <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
* into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each,
* and then convert. This function does not offer that level of granularity. The options once
* set will apply to all labels in the domain name
*
* @param src Input UChar array containing IDN in ASCII (ACE encoded) form.
* @param srcLength Number of UChars in src, or -1 if NUL-terminated.
* @param dest Output UChar array containing Unicode equivalent of source IDN.
* @param destCapacity Size of dest.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
* U_IDNA_UNASSIGNED_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param parseError Pointer to UParseError struct to receive information on position
* of error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
* @deprecated ICU 55 Use UTS #46 instead via uidna_openUTS46() or class IDNA.
*/
U_DEPRECATED int32_t U_EXPORT2
uidna_IDNToUnicode( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status);
/**
* IDNA2003: Compare two IDN strings for equivalence.
* This function splits the domain names into labels and compares them.
* According to IDN RFC, whenever two labels are compared, they are
* considered equal if and only if their ASCII forms (obtained by
* applying toASCII) match using a case-insensitive ASCII comparison.
* Two domain names are considered a match if and only if all labels
* match regardless of whether label separators match.
*
* @param s1 First source string.
* @param length1 Length of first source string, or -1 if NUL-terminated.
*
* @param s2 Second source string.
* @param length2 Length of second source string, or -1 if NUL-terminated.
* @param options A bit set of options:
*
* - UIDNA_DEFAULT Use default options, i.e., do not process unassigned code points
* and do not use STD3 ASCII rules
* If unassigned code points are found the operation fails with
* U_IDNA_UNASSIGNED_ERROR error code.
*
* - UIDNA_ALLOW_UNASSIGNED Unassigned values can be converted to ASCII for query operations
* If this option is set, the unassigned code points in the input
* are treated as normal Unicode code points.
*
* - UIDNA_USE_STD3_RULES Use STD3 ASCII rules for host name syntax restrictions
* If this option is set and the input does not satisfy STD3 rules,
* the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
*
* @param status ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return <0 or 0 or >0 as usual for string comparisons
* @deprecated ICU 55 Use UTS #46 instead via uidna_openUTS46() or class IDNA.
*/
U_DEPRECATED int32_t U_EXPORT2
uidna_compare( const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
int32_t options,
UErrorCode* status);
#endif /* U_HIDE_DEPRECATED_API */
#endif /* #if !UCONFIG_NO_IDNA */
#endif

709
thirdparty/icu4c/common/unicode/uiter.h vendored Normal file
View File

@@ -0,0 +1,709 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2002-2011 International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uiter.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jan18
* created by: Markus W. Scherer
*/
#ifndef __UITER_H__
#define __UITER_H__
/**
* \file
* \brief C API: Unicode Character Iteration
*
* @see UCharIterator
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
class CharacterIterator;
class Replaceable;
U_NAMESPACE_END
#endif
U_CDECL_BEGIN
struct UCharIterator;
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
/**
 * Origin constants accepted by UCharIterator.getIndex() and UCharIterator.move().
 * Each constant selects the reference point that an index is requested for,
 * or that a move is performed relative to.
 * @see UCharIteratorGetIndex
 * @see UCharIteratorMove
 * @see UCharIterator
 * @stable ICU 2.1
 */
typedef enum UCharIteratorOrigin {
    /** The start index. */
    UITER_START,
    /** The current index. */
    UITER_CURRENT,
    /** The limit index. */
    UITER_LIMIT,
    /** Index 0. */
    UITER_ZERO,
    /** The length. */
    UITER_LENGTH
} UCharIteratorOrigin;
/** Constants for UCharIterator. @stable ICU 2.6 */
enum {
    /**
     * Value that a UCharIteratorMove function may return to report that
     * the move succeeded but the resulting UTF-16 index is not known.
     * This can happen for a move relative to the limit or length, or for
     * a move relative to the current index after a setState()
     * when the current UTF-16 index is not known.
     *
     * Counting from the beginning of the text just to report the
     * current/limit/length index after such a move would be very inefficient.
     * Use getIndex(UITER_CURRENT) to determine the actual index;
     * it will count the UChars if necessary.
     *
     * @stable ICU 2.6
     */
    UITER_UNKNOWN_INDEX = -2
};
/**
* Constant for UCharIterator getState() indicating an error or
* an unknown state.
* Returned by uiter_getState()/UCharIteratorGetState
* when an error occurs.
* Also, some UCharIterator implementations may not be able to return
* a valid state for each position. This will be clearly documented
* for each such iterator (none of the public ones here).
*
* The value is the all-ones 32-bit word 0xffffffff, typed as uint32_t
* to match the return type of getState().
*
* @see UCharIteratorGetState
* @see uiter_getState
* @stable ICU 2.6
*/
#define UITER_NO_STATE ((uint32_t)0xffffffff)
/**
* Function type declaration for UCharIterator.getIndex().
*
* Gets the current position, or the start or limit of the
* iteration range.
*
* This function may perform slowly for UITER_CURRENT after setState() was called,
* or for UITER_LENGTH, because an iterator implementation may have to count
* UChars if the underlying storage is not UTF-16.
*
* @param iter the UCharIterator structure ("this pointer")
* @param origin get the 0, start, limit, length, or current index
* @return the requested index, or U_SENTINEL in an error condition
*
* @see UCharIteratorOrigin
* @see UCharIterator
* @stable ICU 2.1
*/
typedef int32_t U_CALLCONV
UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin);
/**
* Function type declaration for UCharIterator.move().
*
* Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index).
*
* Moves the current position relative to the start or limit of the
* iteration range, or relative to the current position itself.
* The movement is expressed in numbers of code units forward
* or backward by specifying a positive or negative delta.
* Out of bounds movement will be pinned to the start or limit.
*
* This function may perform slowly for moving relative to UITER_LENGTH
* because an iterator implementation may have to count the rest of the
* UChars if the native storage is not UTF-16.
*
* When moving relative to the limit or length, or
* relative to the current position after setState() was called,
* move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient
* determination of the actual UTF-16 index.
* The actual index can be determined with getIndex(UITER_CURRENT)
* which will count the UChars if necessary.
* See UITER_UNKNOWN_INDEX for details.
*
* @param iter the UCharIterator structure ("this pointer")
* @param delta can be positive, zero, or negative
* @param origin move relative to the 0, start, limit, length, or current index
* @return the new index, or U_SENTINEL on an error condition,
* or UITER_UNKNOWN_INDEX when the index is not known.
*
* @see UCharIteratorOrigin
* @see UCharIterator
* @see UITER_UNKNOWN_INDEX
* @stable ICU 2.1
*/
typedef int32_t U_CALLCONV
UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin);
/**
* Function type declaration for UCharIterator.hasNext().
*
* Check if current() and next() can still
* return another code unit.
*
* @param iter the UCharIterator structure ("this pointer")
* @return boolean value for whether current() and next() can still return another code unit
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UBool U_CALLCONV
UCharIteratorHasNext(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.hasPrevious().
*
* Check if previous() can still return another code unit.
*
* @param iter the UCharIterator structure ("this pointer")
* @return boolean value for whether previous() can still return another code unit
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UBool U_CALLCONV
UCharIteratorHasPrevious(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.current().
*
* Return the code unit at the current position,
* or U_SENTINEL if there is none (index is at the limit).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the current code unit
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UChar32 U_CALLCONV
UCharIteratorCurrent(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.next().
*
* Return the code unit at the current index and increment
* the index (post-increment, like s[i++]),
* or return U_SENTINEL if there is none (index is at the limit).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the current code unit (and post-increment the current index)
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UChar32 U_CALLCONV
UCharIteratorNext(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.previous().
*
* Decrement the index and return the code unit from there
* (pre-decrement, like s[--i]),
* or return U_SENTINEL if there is none (index is at the start).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the previous code unit (after pre-decrementing the current index)
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef UChar32 U_CALLCONV
UCharIteratorPrevious(UCharIterator *iter);
/**
* Function type declaration for UCharIterator.reservedFn().
* Reserved for future use.
*
* @param iter the UCharIterator structure ("this pointer")
* @param something some integer argument
* @return some integer
*
* @see UCharIterator
* @stable ICU 2.1
*/
typedef int32_t U_CALLCONV
UCharIteratorReserved(UCharIterator *iter, int32_t something);
/**
* Function type declaration for UCharIterator.getState().
*
* Get the "state" of the iterator in the form of a single 32-bit word.
* It is recommended that the state value be calculated to be as small as
* is feasible. For strings with limited lengths, fewer than 32 bits may
* be sufficient.
*
* This is used together with setState()/UCharIteratorSetState
* to save and restore the iterator position more efficiently than with
* getIndex()/move().
*
* The iterator state is defined as a uint32_t value because it is designed
* for use in ucol_nextSortKeyPart() which provides 32 bits to store the state
* of the character iterator.
*
* With some UCharIterator implementations (e.g., UTF-8),
* getting and setting the UTF-16 index with existing functions
* (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but
* relatively slow because the iterator has to "walk" from a known index
* to the requested one.
* This takes more time the farther it needs to go.
*
* An opaque state value allows an iterator implementation to provide
* an internal index (UTF-8: the source byte array index) for
* fast, constant-time restoration.
*
* After calling setState(), a getIndex(UITER_CURRENT) may be slow because
* the UTF-16 index may not be restored as well, but the iterator can deliver
* the correct text contents and move relative to the current position
* without performance degradation.
*
* Some UCharIterator implementations may not be able to return
* a valid state for each position, in which case they return UITER_NO_STATE instead.
* This will be clearly documented for each such iterator (none of the public ones here).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the state word
*
* @see UCharIterator
* @see UCharIteratorSetState
* @see UITER_NO_STATE
* @stable ICU 2.6
*/
typedef uint32_t U_CALLCONV
UCharIteratorGetState(const UCharIterator *iter);
/**
* Function type declaration for UCharIterator.setState().
*
* Restore the "state" of the iterator using a state word from a getState() call.
* The iterator object need not be the same one as for which getState() was called,
* but it must be of the same type (set up using the same uiter_setXYZ function)
* and it must iterate over the same string
* (binary identical regardless of memory address).
* For more about the state word see UCharIteratorGetState.
*
* After calling setState(), a getIndex(UITER_CURRENT) may be slow because
* the UTF-16 index may not be restored as well, but the iterator can deliver
* the correct text contents and move relative to the current position
* without performance degradation.
*
* @param iter the UCharIterator structure ("this pointer")
* @param state the state word from a getState() call
* on a same-type, same-string iterator
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @see UCharIterator
* @see UCharIteratorGetState
* @stable ICU 2.6
*/
typedef void U_CALLCONV
UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
/**
* C API for code unit iteration.
* This can be used as a C wrapper around
* CharacterIterator, Replaceable, or implemented using simple strings, etc.
*
* There are two roles for using UCharIterator:
*
* A "provider" sets the necessary function pointers and controls the "protected"
* fields of the UCharIterator structure. A "provider" passes a UCharIterator
* into C APIs that need a UCharIterator as an abstract, flexible string interface.
*
* Implementations of such C APIs are "callers" of UCharIterator functions;
* they only use the "public" function pointers and never access the "protected"
* fields directly.
*
* The current() and next() functions only check the current index against the
* limit, and previous() only checks the current index against the start,
* to see if the iterator already reached the end of the iteration range.
*
* The assumption - in all iterators - is that the index is moved via the API,
* which means it won't go out of bounds, or the index is modified by
* user code that knows enough about the iterator implementation to set valid
* index values.
*
* UCharIterator functions return code unit values 0..0xffff,
* or U_SENTINEL if the iteration bounds are reached.
*
* @stable ICU 2.1
*/
struct UCharIterator {
/**
* (protected) Pointer to string or wrapped object or similar.
* Not used by caller.
* @stable ICU 2.1
*/
const void *context;
/**
* (protected) Length of string or similar.
* Not used by caller.
* @stable ICU 2.1
*/
int32_t length;
/**
* (protected) Start index or similar.
* Not used by caller.
* @stable ICU 2.1
*/
int32_t start;
/**
* (protected) Current index or similar.
* Not used by caller.
* @stable ICU 2.1
*/
int32_t index;
/**
* (protected) Limit index or similar.
* Not used by caller.
* @stable ICU 2.1
*/
int32_t limit;
/**
* (protected) Used by UTF-8 iterators and possibly others.
* @stable ICU 2.1
*/
int32_t reservedField;
/**
* (public) Returns the current position or the
* start or limit index of the iteration range.
*
* @see UCharIteratorGetIndex
* @stable ICU 2.1
*/
UCharIteratorGetIndex *getIndex;
/**
* (public) Moves the current position relative to the start or limit of the
* iteration range, or relative to the current position itself.
* The movement is expressed in numbers of code units forward
* or backward by specifying a positive or negative delta.
*
* @see UCharIteratorMove
* @stable ICU 2.1
*/
UCharIteratorMove *move;
/**
* (public) Check if current() and next() can still
* return another code unit.
*
* @see UCharIteratorHasNext
* @stable ICU 2.1
*/
UCharIteratorHasNext *hasNext;
/**
* (public) Check if previous() can still return another code unit.
*
* @see UCharIteratorHasPrevious
* @stable ICU 2.1
*/
UCharIteratorHasPrevious *hasPrevious;
/**
* (public) Return the code unit at the current position,
* or U_SENTINEL if there is none (index is at the limit).
*
* @see UCharIteratorCurrent
* @stable ICU 2.1
*/
UCharIteratorCurrent *current;
/**
* (public) Return the code unit at the current index and increment
* the index (post-increment, like s[i++]),
* or return U_SENTINEL if there is none (index is at the limit).
*
* @see UCharIteratorNext
* @stable ICU 2.1
*/
UCharIteratorNext *next;
/**
* (public) Decrement the index and return the code unit from there
* (pre-decrement, like s[--i]),
* or return U_SENTINEL if there is none (index is at the start).
*
* @see UCharIteratorPrevious
* @stable ICU 2.1
*/
UCharIteratorPrevious *previous;
/**
* (public) Reserved for future use. Currently NULL.
*
* @see UCharIteratorReserved
* @stable ICU 2.1
*/
UCharIteratorReserved *reservedFn;
/**
* (public) Return the state of the iterator, to be restored later with setState().
* This function pointer is NULL if the iterator does not implement it.
*
* @see UCharIteratorGetState
* @stable ICU 2.6
*/
UCharIteratorGetState *getState;
/**
* (public) Restore the iterator state from the state word from a call
* to getState().
* This function pointer is NULL if the iterator does not implement it.
*
* @see UCharIteratorSetState
* @stable ICU 2.6
*/
UCharIteratorSetState *setState;
};
/**
* Helper function for UCharIterator to get the code point
* at the current index.
*
* Return the code point that includes the code unit at the current position,
* or U_SENTINEL if there is none (index is at the limit).
* If the current code unit is a lead or trail surrogate,
* then the following or preceding surrogate is used to form
* the code point value.
*
* @param iter the UCharIterator structure ("this pointer")
* @return the current code point
*
* @see UCharIterator
* @see U16_GET
* @see UnicodeString::char32At()
* @stable ICU 2.1
*/
U_CAPI UChar32 U_EXPORT2
uiter_current32(UCharIterator *iter);
/**
* Helper function for UCharIterator to get the next code point.
*
* Return the code point at the current index and increment
* the index (post-increment, like s[i++]),
* or return U_SENTINEL if there is none (index is at the limit).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the current code point (and post-increment the current index)
*
* @see UCharIterator
* @see U16_NEXT
* @stable ICU 2.1
*/
U_CAPI UChar32 U_EXPORT2
uiter_next32(UCharIterator *iter);
/**
* Helper function for UCharIterator to get the previous code point.
*
* Decrement the index and return the code point from there
* (pre-decrement, like s[--i]),
* or return U_SENTINEL if there is none (index is at the start).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the previous code point (after pre-decrementing the current index)
*
* @see UCharIterator
* @see U16_PREV
* @stable ICU 2.1
*/
U_CAPI UChar32 U_EXPORT2
uiter_previous32(UCharIterator *iter);
/**
* Get the "state" of the iterator in the form of a single 32-bit word.
* This is a convenience function that calls iter->getState(iter)
* if iter->getState is not NULL;
* if it is NULL or any other error occurs, then UITER_NO_STATE is returned.
*
* Some UCharIterator implementations may not be able to return
* a valid state for each position, in which case they return UITER_NO_STATE instead.
* This will be clearly documented for each such iterator (none of the public ones here).
*
* @param iter the UCharIterator structure ("this pointer")
* @return the state word
*
* @see UCharIterator
* @see UCharIteratorGetState
* @see UITER_NO_STATE
* @stable ICU 2.6
*/
U_CAPI uint32_t U_EXPORT2
uiter_getState(const UCharIterator *iter);
/**
* Restore the "state" of the iterator using a state word from a getState() call.
* This is a convenience function that calls iter->setState(iter, state, pErrorCode)
* if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set.
*
* @param iter the UCharIterator structure ("this pointer")
* @param state the state word from a getState() call
* on a same-type, same-string iterator
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @see UCharIterator
* @see UCharIteratorSetState
* @stable ICU 2.6
*/
U_CAPI void U_EXPORT2
uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode);
/**
* Set up a UCharIterator to iterate over a string.
*
* Sets the UCharIterator function pointers for iteration over the string s
* with iteration boundaries start=index=0 and length=limit=string length.
* The "provider" may set the start, index, and limit values at any time
* within the range 0..length.
* The length field will be ignored.
*
* The string pointer s is set into UCharIterator.context without copying
* or reallocating the string contents.
*
* getState() simply returns the current index.
* move() will always return the final index.
*
* @param iter UCharIterator structure to be set for iteration
* @param s String to iterate over
* @param length Length of s, or -1 if NUL-terminated
*
* @see UCharIterator
* @stable ICU 2.1
*/
U_CAPI void U_EXPORT2
uiter_setString(UCharIterator *iter, const UChar *s, int32_t length);
/**
* Set up a UCharIterator to iterate over a UTF-16BE string
* (byte vector with a big-endian pair of bytes per UChar).
*
* Everything works just like with a normal UChar iterator (uiter_setString),
* except that UChars are assembled from byte pairs,
* and that the length argument here indicates an even number of bytes.
*
* getState() simply returns the current index.
* move() will always return the final index.
*
* @param iter UCharIterator structure to be set for iteration
* @param s UTF-16BE string to iterate over
* @param length Length of s as an even number of bytes, or -1 if NUL-terminated
* (NUL means pair of 0 bytes at even index from s)
*
* @see UCharIterator
* @see uiter_setString
* @stable ICU 2.6
*/
U_CAPI void U_EXPORT2
uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length);
/**
* Set up a UCharIterator to iterate over a UTF-8 string.
*
* Sets the UCharIterator function pointers for iteration over the UTF-8 string s
* with UTF-8 iteration boundaries 0 and length.
* The implementation counts the UTF-16 index on the fly and
* lazily evaluates the UTF-16 length of the text.
*
* The start field is used as the UTF-8 offset, the limit field as the UTF-8 length.
* When the reservedField is not 0, then it contains a supplementary code point
* and the UTF-16 index is between the two corresponding surrogates.
* At that point, the UTF-8 index is behind that code point.
*
* The UTF-8 string pointer s is set into UCharIterator.context without copying
* or reallocating the string contents.
*
* getState() returns a state value consisting of
* - the current UTF-8 source byte index (bits 31..1)
* - a flag (bit 0) that indicates whether the UChar position is in the middle
* of a surrogate pair
* (from a 4-byte UTF-8 sequence for the corresponding supplementary code point)
*
* getState() cannot also encode the UTF-16 index in the state value.
* move(relative to limit or length), or
* move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX.
*
* @param iter UCharIterator structure to be set for iteration
* @param s UTF-8 string to iterate over
* @param length Length of s in bytes, or -1 if NUL-terminated
*
* @see UCharIterator
* @stable ICU 2.6
*/
U_CAPI void U_EXPORT2
uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length);
#if U_SHOW_CPLUSPLUS_API
/**
* Set up a UCharIterator to wrap around a C++ CharacterIterator.
*
* Sets the UCharIterator function pointers for iteration using the
* CharacterIterator charIter.
*
* The CharacterIterator pointer charIter is set into UCharIterator.context
* without copying or cloning the CharacterIterator object.
* The other "protected" UCharIterator fields are set to 0 and will be ignored.
* The iteration index and boundaries are controlled by the CharacterIterator.
*
* getState() simply returns the current index.
* move() will always return the final index.
*
* @param iter UCharIterator structure to be set for iteration
* @param charIter CharacterIterator to wrap
*
* @see UCharIterator
* @stable ICU 2.1
*/
U_CAPI void U_EXPORT2
uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter);
/**
* Set up a UCharIterator to iterate over a C++ Replaceable.
*
* Sets the UCharIterator function pointers for iteration over the
* Replaceable rep with iteration boundaries start=index=0 and
* length=limit=rep->length().
* The "provider" may set the start, index, and limit values at any time
* within the range 0..length=rep->length().
* The length field will be ignored.
*
* The Replaceable pointer rep is set into UCharIterator.context without copying
* or cloning/reallocating the Replaceable object.
*
* getState() simply returns the current index.
* move() will always return the final index.
*
* @param iter UCharIterator structure to be set for iteration
* @param rep Replaceable to iterate over
*
* @see UCharIterator
* @stable ICU 2.1
*/
U_CAPI void U_EXPORT2
uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep);
#endif
U_CDECL_END
#endif

View File

@@ -0,0 +1,307 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2016, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
#ifndef __ULDNAMES_H__
#define __ULDNAMES_H__
/**
* \file
* \brief C API: Provides display names of Locale ids and their components.
*/
#include "unicode/utypes.h"
#include "unicode/uscript.h"
#include "unicode/udisplaycontext.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
* Enum used in LocaleDisplayNames::createInstance and as the
* dialectHandling argument of uldn_open().
* @stable ICU 4.4
*/
typedef enum {
/**
* Use standard names when generating a locale name,
* e.g. en_GB displays as 'English (United Kingdom)'.
* @stable ICU 4.4
*/
ULDN_STANDARD_NAMES = 0,
/**
* Use dialect names when generating a locale name,
* e.g. en_GB displays as 'British English'.
* @stable ICU 4.4
*/
ULDN_DIALECT_NAMES
} UDialectHandling;
/**
* Opaque C service object type for the locale display names API
* @stable ICU 4.4
*/
struct ULocaleDisplayNames;
/**
* C typedef for struct ULocaleDisplayNames.
* @stable ICU 4.4
*/
typedef struct ULocaleDisplayNames ULocaleDisplayNames;
#if !UCONFIG_NO_FORMATTING
/**
* Returns an instance of LocaleDisplayNames that returns names
* formatted for the provided locale, using the provided
* dialectHandling. The usual value for dialectHandling is
* ULDN_STANDARD_NAMES.
*
* @param locale the display locale
* @param dialectHandling how to select names for locales
* @param pErrorCode the status code
* @return a ULocaleDisplayNames instance
* @stable ICU 4.4
*/
U_CAPI ULocaleDisplayNames * U_EXPORT2
uldn_open(const char * locale,
UDialectHandling dialectHandling,
UErrorCode *pErrorCode);
/**
* Closes a ULocaleDisplayNames instance obtained from uldn_open().
* @param ldn the ULocaleDisplayNames instance to be closed
* @stable ICU 4.4
*/
U_CAPI void U_EXPORT2
uldn_close(ULocaleDisplayNames *ldn);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalULocaleDisplayNamesPointer
* "Smart pointer" class, closes a ULocaleDisplayNames via uldn_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalULocaleDisplayNamesPointer, ULocaleDisplayNames, uldn_close);
U_NAMESPACE_END
#endif
/* getters for state */
/**
* Returns the locale used to determine the display names. This is
* not necessarily the same locale passed to {@link #uldn_open}.
* @param ldn the LocaleDisplayNames instance
* @return the display locale
* @stable ICU 4.4
*/
U_CAPI const char * U_EXPORT2
uldn_getLocale(const ULocaleDisplayNames *ldn);
/**
* Returns the dialect handling used in the display names.
* @param ldn the LocaleDisplayNames instance
* @return the dialect handling enum
* @stable ICU 4.4
*/
U_CAPI UDialectHandling U_EXPORT2
uldn_getDialectHandling(const ULocaleDisplayNames *ldn);
/* names for entire locales */
/**
* Returns the display name of the provided locale.
* @param ldn the LocaleDisplayNames instance
* @param locale the locale whose display name to return
* @param result receives the display name
* @param maxResultSize the size of the result buffer
* @param pErrorCode the status code
* @return the actual buffer size needed for the display name. If it's
* greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
uldn_localeDisplayName(const ULocaleDisplayNames *ldn,
const char *locale,
UChar *result,
int32_t maxResultSize,
UErrorCode *pErrorCode);
/* names for components of a locale */
/**
* Returns the display name of the provided language code.
* @param ldn the LocaleDisplayNames instance
* @param lang the language code whose display name to return
* @param result receives the display name
* @param maxResultSize the size of the result buffer
* @param pErrorCode the status code
* @return the actual buffer size needed for the display name. If it's
* greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
uldn_languageDisplayName(const ULocaleDisplayNames *ldn,
const char *lang,
UChar *result,
int32_t maxResultSize,
UErrorCode *pErrorCode);
/**
* Returns the display name of the provided script.
* @param ldn the LocaleDisplayNames instance
* @param script the script whose display name to return
* @param result receives the display name
* @param maxResultSize the size of the result buffer
* @param pErrorCode the status code
* @return the actual buffer size needed for the display name. If it's
* greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
uldn_scriptDisplayName(const ULocaleDisplayNames *ldn,
const char *script,
UChar *result,
int32_t maxResultSize,
UErrorCode *pErrorCode);
/**
* Returns the display name of the provided script code.
* @param ldn the LocaleDisplayNames instance
* @param scriptCode the script code whose display name to return
* @param result receives the display name
* @param maxResultSize the size of the result buffer
* @param pErrorCode the status code
* @return the actual buffer size needed for the display name. If it's
* greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
uldn_scriptCodeDisplayName(const ULocaleDisplayNames *ldn,
UScriptCode scriptCode,
UChar *result,
int32_t maxResultSize,
UErrorCode *pErrorCode);
/**
* Returns the display name of the provided region code.
* @param ldn the LocaleDisplayNames instance
* @param region the region code whose display name to return
* @param result receives the display name
* @param maxResultSize the size of the result buffer
* @param pErrorCode the status code
* @return the actual buffer size needed for the display name. If it's
* greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
uldn_regionDisplayName(const ULocaleDisplayNames *ldn,
const char *region,
UChar *result,
int32_t maxResultSize,
UErrorCode *pErrorCode);
/**
* Returns the display name of the provided variant.
* @param ldn the LocaleDisplayNames instance
* @param variant the variant whose display name to return
* @param result receives the display name
* @param maxResultSize the size of the result buffer
* @param pErrorCode the status code
* @return the actual buffer size needed for the display name. If it's
* greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
uldn_variantDisplayName(const ULocaleDisplayNames *ldn,
const char *variant,
UChar *result,
int32_t maxResultSize,
UErrorCode *pErrorCode);
/**
* Returns the display name of the provided locale key.
* @param ldn the LocaleDisplayNames instance
* @param key the locale key whose display name to return
* @param result receives the display name
* @param maxResultSize the size of the result buffer
* @param pErrorCode the status code
* @return the actual buffer size needed for the display name. If it's
* greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
uldn_keyDisplayName(const ULocaleDisplayNames *ldn,
const char *key,
UChar *result,
int32_t maxResultSize,
UErrorCode *pErrorCode);
/**
* Returns the display name of the provided value (used with the provided key).
* @param ldn the LocaleDisplayNames instance
* @param key the locale key
* @param value the locale key's value
* @param result receives the display name
* @param maxResultSize the size of the result buffer
* @param pErrorCode the status code
* @return the actual buffer size needed for the display name. If it's
* greater than maxResultSize, the returned name will be truncated.
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
uldn_keyValueDisplayName(const ULocaleDisplayNames *ldn,
const char *key,
const char *value,
UChar *result,
int32_t maxResultSize,
UErrorCode *pErrorCode);
/**
* Returns an instance of LocaleDisplayNames that returns names formatted
* for the provided locale, using the provided UDisplayContext settings.
*
* @param locale The display locale
* @param contexts List of one or more context settings (e.g. for dialect
* handling, capitalization, etc.).
* @param length Number of items in the contexts list
* @param pErrorCode Pointer to UErrorCode input/output status. If at entry this indicates
* a failure status, the function will do nothing; otherwise this will be
* updated with any new status from the function.
* @return a ULocaleDisplayNames instance
* @stable ICU 51
*/
U_CAPI ULocaleDisplayNames * U_EXPORT2
uldn_openForContext(const char * locale, UDisplayContext *contexts,
int32_t length, UErrorCode *pErrorCode);
/**
* Returns the UDisplayContext value for the specified UDisplayContextType.
* @param ldn the ULocaleDisplayNames instance
* @param type the UDisplayContextType whose value to return
* @param pErrorCode Pointer to UErrorCode input/output status. If at entry this indicates
* a failure status, the function will do nothing; otherwise this will be
* updated with any new status from the function.
* @return the UDisplayContextValue for the specified type.
* @stable ICU 51
*/
U_CAPI UDisplayContext U_EXPORT2
uldn_getContext(const ULocaleDisplayNames *ldn, UDisplayContextType type,
UErrorCode *pErrorCode);
#endif /* !UCONFIG_NO_FORMATTING */
#endif /* __ULDNAMES_H__ */

1410
thirdparty/icu4c/common/unicode/uloc.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,226 @@
// © 2023 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef ULOCALE_H
#define ULOCALE_H
#include "unicode/localpointer.h"
#include "unicode/uenum.h"
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Locale ID functionality similar to C++ class Locale
*/
/**
* Opaque C service object type for the locale API
* @stable ICU 74
*/
struct ULocale;
/**
* C typedef for struct ULocale.
* @stable ICU 74
*/
typedef struct ULocale ULocale;
/**
* Constructs a ULocale from the locale ID.
* The created ULocale should be destroyed by calling
* ulocale_close().
* @param localeID the locale, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the locale; if negative, then the locale must be
* null terminated.
* @param err the error code
* @return the locale.
*
* @stable ICU 74
*/
U_CAPI ULocale* U_EXPORT2
ulocale_openForLocaleID(const char* localeID, int32_t length, UErrorCode* err);
/**
* Constructs a ULocale from the provided IETF BCP 47 language tag.
* The created ULocale should be destroyed by calling
* ulocale_close().
* @param tag the language tag, defined as IETF BCP 47 language tag, const
* char* pointer (need not be terminated when the length is non-negative)
* @param length the length of the tag; if negative, then the tag must be
* null terminated.
* @param err the error code
* @return the locale.
*
* @stable ICU 74
*/
U_CAPI ULocale* U_EXPORT2
ulocale_openForLanguageTag(const char* tag, int32_t length, UErrorCode* err);
/**
* Close the locale and destroy its internal state.
*
* @param locale the locale
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocale_close(ULocale* locale);
/**
* Returns the locale's ISO-639 language code.
*
* @param locale the locale
* @return the language code of the locale.
* @stable ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getLanguage(const ULocale* locale);
/**
* Returns the locale's ISO-15924 abbreviation script code.
*
* @param locale the locale
* @return A pointer to the script.
* @stable ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getScript(const ULocale* locale);
/**
* Returns the locale's ISO-3166 region code.
*
* @param locale the locale
* @return A pointer to the region.
* @stable ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getRegion(const ULocale* locale);
/**
* Returns the locale's variant code.
*
* @param locale the locale
* @return A pointer to the variant.
* @stable ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getVariant(const ULocale* locale);
/**
* Returns the programmatic name of the entire locale, with the language,
* country and variant separated by underbars. If a field is missing, up
* to two leading underbars will occur. Example: "en", "de_DE", "en_US_WIN",
* "de__POSIX", "fr__MAC", "__MAC", "_MT", "_FR_EURO"
*
* @param locale the locale
* @return A pointer to "name".
* @stable ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getLocaleID(const ULocale* locale);
/**
* Returns the programmatic name of the entire locale as ulocale_getLocaleID()
* would return, but without keywords.
*
* @param locale the locale
* @return A pointer to "base name".
* @stable ICU 74
*/
U_CAPI const char* U_EXPORT2
ulocale_getBaseName(const ULocale* locale);
/**
* Gets the bogus state. A locale object can be bogus if it doesn't exist.
*
* @param locale the locale
* @return false if it is a real locale, true if it is a bogus locale
* @stable ICU 74
*/
U_CAPI bool U_EXPORT2
ulocale_isBogus(const ULocale* locale);
/**
* Gets the list of keywords for the specified locale.
*
* @param locale the locale
* @param err the error code
* @return pointer to UEnumeration, or nullptr if there are no keywords.
* Client must call uenum_close() to dispose the returned value.
* @stable ICU 74
*/
U_CAPI UEnumeration* U_EXPORT2
ulocale_getKeywords(const ULocale* locale, UErrorCode *err);
/**
* Gets the list of unicode keywords for the specified locale.
*
* @param locale the locale
* @param err the error code
* @return pointer to UEnumeration, or nullptr if there are no keywords.
* Client must call uenum_close() to dispose the returned value.
* @stable ICU 74
*/
U_CAPI UEnumeration* U_EXPORT2
ulocale_getUnicodeKeywords(const ULocale* locale, UErrorCode *err);
/**
* Gets the value for a keyword.
*
* This uses legacy keyword=value pairs, like "collation=phonebook".
*
* @param locale the locale
* @param keyword the keyword, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param keywordLength the length of the keyword; if negative, then the
* keyword must be null terminated.
* @param valueBuffer The buffer to receive the value.
* @param valueBufferCapacity The capacity of receiving valueBuffer.
* @param err the error code
* @stable ICU 74
*/
U_CAPI int32_t U_EXPORT2
ulocale_getKeywordValue(
const ULocale* locale, const char* keyword, int32_t keywordLength,
char* valueBuffer, int32_t valueBufferCapacity, UErrorCode *err);
/**
* Gets the Unicode value for a Unicode keyword.
*
* This uses Unicode key-value pairs, like "co-phonebk".
*
* @param locale the locale
* @param keyword the Unicode keyword, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param keywordLength the length of the Unicode keyword; if negative,
* then the keyword must be null terminated.
* @param valueBuffer The buffer to receive the Unicode value.
* @param valueBufferCapacity The capacity of receiving valueBuffer.
* @param err the error code
* @stable ICU 74
*/
U_CAPI int32_t U_EXPORT2
ulocale_getUnicodeKeywordValue(
const ULocale* locale, const char* keyword, int32_t keywordLength,
char* valueBuffer, int32_t valueBufferCapacity, UErrorCode *err);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalULocalePointer
* "Smart pointer" class, closes a ULocale via ulocale_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 74
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalULocalePointer, ULocale, ulocale_close);
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif /* ULOCALE_H */

View File

@@ -0,0 +1,437 @@
// © 2023 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#ifndef __ULOCBUILDER_H__
#define __ULOCBUILDER_H__
#include "unicode/localpointer.h"
#include "unicode/ulocale.h"
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Builder API for Locale
*/
/**
* Opaque C service object type for the locale builder API
* @stable ICU 74
*/
struct ULocaleBuilder;
/**
* C typedef for struct ULocaleBuilder.
* @stable ICU 74
*/
typedef struct ULocaleBuilder ULocaleBuilder;
/**
* <code>ULocaleBuilder</code> is used to build valid <code>locale</code> id
* string or IETF BCP 47 language tag from values configured by the setters.
* The <code>ULocaleBuilder</code> checks if a value configured by a
* setter satisfies the syntax requirements defined by the <code>Locale</code>
* class. A string of Locale created by a <code>ULocaleBuilder</code> is
* well-formed and can be transformed to a well-formed IETF BCP 47 language tag
* without losing information.
*
* <p>The following example shows how to create a <code>locale</code> string
* with the <code>ULocaleBuilder</code>.
* <blockquote>
* <pre>
* UErrorCode err = U_ZERO_ERROR;
* char buffer[ULOC_FULLNAME_CAPACITY];
* ULocaleBuilder* builder = ulocbld_open();
* ulocbld_setLanguage(builder, "sr", -1);
* ulocbld_setScript(builder, "Latn", -1);
* ulocbld_setRegion(builder, "RS", -1);
* int32_t length = ulocbld_buildLocaleID(
* builder, buffer, ULOC_FULLNAME_CAPACITY, &err);
* ulocbld_close(builder);
* </pre>
* </blockquote>
*
* <p>ULocaleBuilders can be reused; <code>ulocbld_clear()</code> resets all
* fields to their default values.
*
* <p>ULocaleBuilder tracks errors in an internal UErrorCode. All setters
* except ulocbld_setLanguageTag() and ulocbld_setLocale() return immediately
* if the internal UErrorCode is in an error state.
* To reset the internal state and error code, call ulocbld_clear().
* ulocbld_setLanguageTag() and ulocbld_setLocale() first clear the internal
* UErrorCode, then record any error from validating the input parameter
* in the internal UErrorCode.
*
* @stable ICU 74
*/
/**
* Constructs an empty ULocaleBuilder. The default value of all
* fields, extensions, and private use information is the
* empty string. The created builder should be destroyed by calling
* ulocbld_close();
*
* @stable ICU 74
*/
U_CAPI ULocaleBuilder* U_EXPORT2
ulocbld_open(void);
/**
* Close the builder and destroy its internal state.
* @param builder the builder
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_close(ULocaleBuilder* builder);
/**
* Resets the <code>ULocaleBuilder</code> to match the provided
* <code>locale</code>. Existing state is discarded.
*
* <p>All fields of the locale must be well-formed.
* <p>This method clears the internal UErrorCode.
*
* @param builder the builder
* @param locale the locale, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the locale; if negative, then the locale must be
* null terminated.
*
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setLocale(ULocaleBuilder* builder, const char* locale, int32_t length);
/**
* Resets the <code>ULocaleBuilder</code> to match the provided
* <code>ULocale</code>. Existing state is discarded.
*
* <p>The locale must not be bogus.
* <p>This method clears the internal UErrorCode.
*
* @param builder the builder.
* @param locale the locale, a ULocale* pointer. The builder adopts the locale
* after the call and the client must not delete it.
*
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_adoptULocale(ULocaleBuilder* builder, ULocale* locale);
/**
* Resets the ULocaleBuilder to match the provided IETF BCP 47 language tag.
* Discards the existing state.
* The empty string causes the builder to be reset, like {@link #ulocbld_clear}.
* Legacy language tags (marked as “Type: grandfathered” in BCP 47)
* are converted to their canonical form before being processed.
* Otherwise, the <code>language tag</code> must be well-formed,
* or else the ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods
* will later report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>This method clears the internal UErrorCode.
*
* @param builder the builder
* @param tag the language tag, defined as IETF BCP 47 language tag, a
* const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the tag; if negative, then the tag must be
* null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setLanguageTag(ULocaleBuilder* builder, const char* tag, int32_t length);
/**
* Sets the language. If <code>language</code> is the empty string, the
* language in this <code>ULocaleBuilder</code> is removed. Otherwise, the
* <code>language</code> must be well-formed, or else the ulocbld_buildLocaleID()
* and ulocbld_buildLanguageTag() methods will
* later report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The syntax of language value is defined as
* [unicode_language_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag).
*
* @param builder the builder
* @param language the language, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the language; if negative, then the language must be
* null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setLanguage(ULocaleBuilder* builder, const char* language, int32_t length);
/**
* Sets the script. If <code>script</code> is the empty string, the script in
* this <code>ULocaleBuilder</code> is removed.
* Otherwise, the <code>script</code> must be well-formed, or else the
* ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods will later
* report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The script value is a four-letter script code as
* [unicode_script_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_script_subtag)
* defined by ISO 15924
*
* @param builder the builder
* @param script the script, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the script; if negative, then the script must be
* null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setScript(ULocaleBuilder* builder, const char* script, int32_t length);
/**
* Sets the region. If region is the empty string, the region in this
* <code>ULocaleBuilder</code> is removed. Otherwise, the <code>region</code>
* must be well-formed, or else the ulocbld_buildLocaleID() and
* ulocbld_buildLanguageTag() methods will later report an
* U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>The region value is defined by
* [unicode_region_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_region_subtag)
* as a two-letter ISO 3166 code or a three-digit UN M.49 area code.
*
* <p>The region value in the <code>Locale</code> created by the
* <code>ULocaleBuilder</code> is always normalized to upper case.
*
* @param builder the builder
* @param region the region, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the region; if negative, then the region must be
* null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setRegion(ULocaleBuilder* builder, const char* region, int32_t length);
/**
* Sets the variant. If variant is the empty string, the variant in this
* <code>ULocaleBuilder</code> is removed. Otherwise, the <code>variant</code>
* must be well-formed, or else the ulocbld_buildLocaleID() and
* ulocbld_buildLanguageTag() methods will later report an
* U_ILLEGAL_ARGUMENT_ERROR.
*
* <p><b>Note:</b> This method checks if <code>variant</code>
* satisfies the
* [unicode_variant_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_variant_subtag)
* syntax requirements, and normalizes the value to lowercase letters. However,
* the <code>Locale</code> class does not impose any syntactic
* restriction on variant. To set an ill-formed variant, use a Locale constructor.
* If there are multiple unicode_variant_subtag, the caller must concatenate
* them with '-' as separator (ex: "foobar-fibar").
*
* @param builder the builder
* @param variant the variant, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the variant; if negative, then the variant must be
* null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setVariant(ULocaleBuilder* builder, const char* variant, int32_t length);
/**
* Sets the extension for the given key. If the value is the empty string,
* the extension is removed. Otherwise, the <code>key</code> and
* <code>value</code> must be well-formed, or else the ulocbld_buildLocaleID()
* and ulocbld_buildLanguageTag() methods will
* later report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p><b>Note:</b> The key ('u') is used for the Unicode locale extension.
* Setting a value for this key replaces any existing Unicode locale key/type
* pairs with those defined in the extension.
*
* <p><b>Note:</b> The key ('x') is used for the private use code. To be
* well-formed, the value for this key needs only to have subtags of one to
* eight alphanumeric characters, not two to eight as in the general case.
*
* @param builder the builder
* @param key the extension key
* @param value the value, a const char * pointer (need not be terminated when
* the length is non-negative)
* @param length the length of the value; if negative, then the value must be
* null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setExtension(ULocaleBuilder* builder, char key, const char* value, int32_t length);
/**
* Sets the Unicode locale keyword type for the given key. If type is a
* null pointer, the keyword is removed.
* If the type is the empty string, the keyword is set without type subtags.
* Otherwise, the key and type must be well-formed, or else the
* ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods will later
* report an U_ILLEGAL_ARGUMENT_ERROR.
*
* <p>Keys and types are converted to lower case.
*
* <p><b>Note</b>:Setting the 'u' extension via {@link #ulocbld_setExtension}
* replaces all Unicode locale keywords with those defined in the
* extension.
*
* @param builder the builder
* @param key the Unicode locale key, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param keyLength the length of the key; if negative, then the key must be
* null terminated.
* @param type the Unicode locale type, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param typeLength the length of the type; if negative, then the type must
* be null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_setUnicodeLocaleKeyword(ULocaleBuilder* builder,
const char* key, int32_t keyLength, const char* type, int32_t typeLength);
/**
* Adds a unicode locale attribute, if not already present, otherwise
* has no effect. The attribute must not be empty string and must be
* well-formed or U_ILLEGAL_ARGUMENT_ERROR will be set to status
* during the ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() calls.
*
* @param builder the builder
* @param attribute the attribute, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param length the length of the attribute; if negative, then the attribute
* must be null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_addUnicodeLocaleAttribute(
ULocaleBuilder* builder, const char* attribute, int32_t length);
/**
* Removes a unicode locale attribute, if present, otherwise has no
* effect. The attribute must not be empty string and must be well-formed
* or U_ILLEGAL_ARGUMENT_ERROR will be set to status during the ulocbld_buildLocaleID()
* and ulocbld_buildLanguageTag() calls.
*
* <p>Attribute comparison for removal is case-insensitive.
*
* @param builder the builder
* @param attribute the attribute, a const char * pointer (need not be
* terminated when the length is non-negative)
* @param length the length of the attribute; if negative, then the attribute
* must be null terminated.
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_removeUnicodeLocaleAttribute(
ULocaleBuilder* builder, const char* attribute, int32_t length);
/**
* Resets the builder to its initial, empty state.
* <p>This method clears the internal UErrorCode.
*
* @param builder the builder
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_clear(ULocaleBuilder* builder);
/**
* Resets the extensions to their initial, empty state.
* Language, script, region and variant are unchanged.
*
* @param builder the builder
* @stable ICU 74
*/
U_CAPI void U_EXPORT2
ulocbld_clearExtensions(ULocaleBuilder* builder);
/**
* Build the LocaleID string from the fields set on this builder.
* If any setter, or the ulocbld_buildLocaleID() call itself, requires a memory
* allocation that fails, U_MEMORY_ALLOCATION_ERROR will be set to status.
* If any of the fields set by the setters are not well-formed, the status
* will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
* not change after the ulocbld_buildLocaleID() call and the caller is
* free to keep using the same builder to build more locales.
*
* @param builder the builder
* @param locale the locale id
* @param localeCapacity the size of the locale buffer to store the locale id
* @param err the error code
* @return the length of the locale id in buffer
* @stable ICU 74
*/
U_CAPI int32_t U_EXPORT2
ulocbld_buildLocaleID(ULocaleBuilder* builder, char* locale,
int32_t localeCapacity, UErrorCode* err);
/**
* Build the ULocale object from the fields set on this builder.
* If any setter, or the ulocbld_buildULocale() call itself, requires a memory
* allocation that fails, U_MEMORY_ALLOCATION_ERROR will be set to status.
* If any of the fields set by the setters are not well-formed, the status
* will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
* not change after the ulocbld_buildULocale() call and the caller is
* free to keep using the same builder to build more locales.
*
* @param builder the builder.
* @param err the error code.
* @return the locale, a ULocale* pointer. The created ULocale must be
* destroyed by calling {@link ulocale_close}.
* @stable ICU 74
*/
U_CAPI ULocale* U_EXPORT2
ulocbld_buildULocale(ULocaleBuilder* builder, UErrorCode* err);
/**
* Build the IETF BCP 47 language tag string from the fields set on this builder.
* If any setter, or the ulocbld_buildLanguageTag() call itself, requires a memory
* allocation that fails, U_MEMORY_ALLOCATION_ERROR will be set to status.
* If any of the fields set by the setters are not well-formed, the status
* will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will
* not change after the ulocbld_buildLanguageTag() call and the caller is free
* to keep using the same builder to build more locales.
*
* @param builder the builder
* @param language the language tag
* @param languageCapacity the size of the language buffer to store the language
* tag
* @param err the error code
* @return the length of the language tag in buffer
* @stable ICU 74
*/
U_CAPI int32_t U_EXPORT2
ulocbld_buildLanguageTag(ULocaleBuilder* builder, char* language,
int32_t languageCapacity, UErrorCode* err);
/**
* Sets the UErrorCode if an error occurred while recording sets.
* Preserves older error codes in the outErrorCode.
*
* @param builder the builder
* @param outErrorCode Set to an error code that occurred while setting subtags.
* Unchanged if there is no such error or if outErrorCode
* already contained an error.
* @return true if U_FAILURE(*outErrorCode)
* @stable ICU 74
*/
U_CAPI UBool U_EXPORT2
ulocbld_copyErrorTo(const ULocaleBuilder* builder, UErrorCode *outErrorCode);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalULocaleBuilderPointer
* "Smart pointer" class, closes a ULocaleBuilder via ulocbld_close().
* For most methods see the LocalPointerBase base class.
* Only declared when U_SHOW_CPLUSPLUS_API is enabled.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 74
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalULocaleBuilderPointer, ULocaleBuilder, ulocbld_close);
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif // __ULOCBUILDER_H__

View File

@@ -0,0 +1,451 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: umachine.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*
* This file defines basic types and constants for ICU to be
* platform-independent. umachine.h and utf.h are included into
* utypes.h to provide all the general definitions for ICU.
* All of these definitions used to be in utypes.h before
* the UTF-handling macros made this unmaintainable.
*/
#ifndef __UMACHINE_H__
#define __UMACHINE_H__
/**
* \file
* \brief Basic types and constants for UTF
*
* <h2> Basic types and constants for UTF </h2>
* This file defines basic types and constants for utf.h to be
* platform-independent. umachine.h and utf.h are included into
* utypes.h to provide all the general definitions for ICU.
* All of these definitions used to be in utypes.h before
* the UTF-handling macros made this unmaintainable.
*
*/
/*==========================================================================*/
/* Include platform-dependent definitions */
/* which are contained in the platform-specific file platform.h */
/*==========================================================================*/
#include "unicode/ptypes.h" /* platform.h is included in ptypes.h */
/*
* ANSI C headers:
* stddef.h defines wchar_t
*/
#include <stdbool.h>
#include <stddef.h>
/*==========================================================================*/
/* For C wrappers, we use the symbol U_CAPI. */
/* This works properly if the includer is C or C++. */
/* Functions are declared U_CAPI return-type U_EXPORT2 function-name()... */
/*==========================================================================*/
/**
* \def U_CFUNC
* This is used in a declaration of a library private ICU C function.
* Expands to extern "C" when compiled as C++, and to plain extern otherwise.
* @stable ICU 2.4
*/
/**
* \def U_CDECL_BEGIN
* This is used to begin a declaration of a library private ICU C API.
* Opens an extern "C" { block when compiled as C++; expands to nothing otherwise.
* @stable ICU 2.4
*/
/**
* \def U_CDECL_END
* This is used to end a declaration of a library private ICU C API.
* Closes the block opened by U_CDECL_BEGIN when compiled as C++;
* expands to nothing otherwise.
* @stable ICU 2.4
*/
#ifdef __cplusplus
# define U_CFUNC extern "C"
# define U_CDECL_BEGIN extern "C" {
# define U_CDECL_END }
#else
# define U_CFUNC extern
# define U_CDECL_BEGIN
# define U_CDECL_END
#endif
#ifndef U_ATTRIBUTE_DEPRECATED
/**
* \def U_ATTRIBUTE_DEPRECATED
* This is used for GCC specific attributes.
* Selected when U_GCC_MAJOR_MINOR >= 302.
* A definition supplied before this header is included is kept as-is.
* @internal
*/
#if U_GCC_MAJOR_MINOR >= 302
# define U_ATTRIBUTE_DEPRECATED __attribute__ ((deprecated))
/**
* \def U_ATTRIBUTE_DEPRECATED
* This is used for Visual C++ specific attributes.
* Selected when _MSC_VER is defined and >= 1400.
* @internal
*/
#elif defined(_MSC_VER) && (_MSC_VER >= 1400)
# define U_ATTRIBUTE_DEPRECATED __declspec(deprecated)
#else
// Neither of the compilers above: the macro expands to nothing.
# define U_ATTRIBUTE_DEPRECATED
#endif
#endif
/** This is used to declare a function as a public ICU C API. @stable ICU 2.0 */
#define U_CAPI U_CFUNC U_EXPORT
/** Obsolete/same as U_CAPI; was used to declare a function as a stable public ICU C API. */
#define U_STABLE U_CAPI
/** Obsolete/same as U_CAPI; was used to declare a function as a draft public ICU C API. */
#define U_DRAFT U_CAPI
/** This is used to declare a function as a deprecated public ICU C API (U_CAPI plus U_ATTRIBUTE_DEPRECATED). */
#define U_DEPRECATED U_CAPI U_ATTRIBUTE_DEPRECATED
/** Obsolete/same as U_CAPI; was used to declare a function as an obsolete public ICU C API. */
#define U_OBSOLETE U_CAPI
/** Obsolete/same as U_CAPI; was used to declare a function as an internal ICU C API. */
#define U_INTERNAL U_CAPI
// Before ICU 65, function-like, multi-statement ICU macros were just defined as
// a series of statements wrapped in { } blocks. The caller could either treat
// them as if they were actual functions and end the invocation with a
// trailing ; (creating an empty statement after the block), or else omit
// this trailing ; using the knowledge that the macro would expand to { }.
//
// But doing so doesn't work well with macros that look like functions and
// compiler warnings about empty statements (ICU-20601), and ICU 65 therefore
// switches to the standard solution of wrapping such macros in do { } while.
//
// This will however break existing code that depends on being able to invoke
// these macros without a trailing ; so to be able to remain compatible with
// such code the wrapper is itself defined as macros so that it's possible to
// build ICU 65 and later with the old macro behaviour, like this:
//
// export CPPFLAGS='-DUPRV_BLOCK_MACRO_BEGIN="" -DUPRV_BLOCK_MACRO_END=""'
// runConfigureICU ...
//
/**
* \def UPRV_BLOCK_MACRO_BEGIN
* Defined as the "do" keyword by default.
* A definition supplied before this header is included (see above) is kept.
* @internal
*/
#ifndef UPRV_BLOCK_MACRO_BEGIN
#define UPRV_BLOCK_MACRO_BEGIN do
#endif
/**
* \def UPRV_BLOCK_MACRO_END
* Defined as "while (false)" by default.
* A definition supplied before this header is included (see above) is kept.
* @internal
*/
#ifndef UPRV_BLOCK_MACRO_END
#define UPRV_BLOCK_MACRO_END while (false)
#endif
/*==========================================================================*/
/* limits for int32_t etc., like in POSIX inttypes.h */
/* Each macro below is only defined here if it is not already defined. */
/*==========================================================================*/
#ifndef INT8_MIN
/** The smallest value an 8-bit signed integer can hold @stable ICU 2.0 */
# define INT8_MIN ((int8_t)(-128))
#endif
#ifndef INT16_MIN
/** The smallest value a 16-bit signed integer can hold @stable ICU 2.0 */
# define INT16_MIN ((int16_t)(-32767-1))
#endif
#ifndef INT32_MIN
/** The smallest value a 32-bit signed integer can hold @stable ICU 2.0 */
# define INT32_MIN ((int32_t)(-2147483647-1))
#endif
#ifndef INT8_MAX
/** The largest value an 8-bit signed integer can hold @stable ICU 2.0 */
# define INT8_MAX ((int8_t)(127))
#endif
#ifndef INT16_MAX
/** The largest value a 16-bit signed integer can hold @stable ICU 2.0 */
# define INT16_MAX ((int16_t)(32767))
#endif
#ifndef INT32_MAX
/** The largest value a 32-bit signed integer can hold @stable ICU 2.0 */
# define INT32_MAX ((int32_t)(2147483647))
#endif
#ifndef UINT8_MAX
/** The largest value an 8-bit unsigned integer can hold @stable ICU 2.0 */
# define UINT8_MAX ((uint8_t)(255U))
#endif
#ifndef UINT16_MAX
/** The largest value a 16-bit unsigned integer can hold @stable ICU 2.0 */
# define UINT16_MAX ((uint16_t)(65535U))
#endif
#ifndef UINT32_MAX
/** The largest value a 32-bit unsigned integer can hold @stable ICU 2.0 */
# define UINT32_MAX ((uint32_t)(4294967295U))
#endif
// 64-bit integer support is mandatory: the build stops with an error
// if U_INT64_T_UNAVAILABLE is defined.
#if defined(U_INT64_T_UNAVAILABLE)
# error int64_t is required for decimal format and rule-based number format.
#else
# ifndef INT64_C
/**
* Provides a platform independent way to specify a signed 64-bit integer constant.
* This fallback appends the LL suffix to its argument.
* note: may be wrong for some 64 bit platforms - ensure your compiler provides INT64_C
* @stable ICU 2.8
*/
# define INT64_C(c) c ## LL
# endif
# ifndef UINT64_C
/**
* Provides a platform independent way to specify an unsigned 64-bit integer constant.
* This fallback appends the ULL suffix to its argument.
* note: may be wrong for some 64 bit platforms - ensure your compiler provides UINT64_C
* @stable ICU 2.8
*/
# define UINT64_C(c) c ## ULL
# endif
# ifndef U_INT64_MIN
/** The smallest value a 64-bit signed integer can hold @stable ICU 2.8 */
# define U_INT64_MIN ((int64_t)(INT64_C(-9223372036854775807)-1))
# endif
# ifndef U_INT64_MAX
/** The largest value a 64-bit signed integer can hold @stable ICU 2.8 */
# define U_INT64_MAX ((int64_t)(INT64_C(9223372036854775807)))
# endif
# ifndef U_UINT64_MAX
/** The largest value a 64-bit unsigned integer can hold @stable ICU 2.8 */
# define U_UINT64_MAX ((uint64_t)(UINT64_C(18446744073709551615)))
# endif
#endif
/*==========================================================================*/
/* Boolean data type */
/*==========================================================================*/
/**
* The ICU boolean type, a signed-byte integer (a typedef of int8_t).
* ICU-specific for historical reasons: The C and C++ standards did not always define type bool.
* Also provides a fixed type definition, as opposed to
* type bool whose details (e.g., sizeof) may vary by compiler and between C and C++.
*
* @stable ICU 2.0
*/
typedef int8_t UBool;
/**
* \def U_DEFINE_FALSE_AND_TRUE
* Normally turns off defining macros FALSE=0 & TRUE=1 in public ICU headers.
* These obsolete macros sometimes break compilation of other code that
* defines enum constants or similar with these names.
* C++ has long defined bool/false/true.
* C99 also added definitions for these, although as macros; see stdbool.h.
*
* You may transitionally define U_DEFINE_FALSE_AND_TRUE=1 if you need time to migrate code.
* If it is not defined before this header is included, it defaults to 0.
*
* @internal ICU 68
*/
#ifdef U_DEFINE_FALSE_AND_TRUE
// Use the predefined value.
#else
// Default to avoiding collision with non-macro definitions of FALSE & TRUE.
# define U_DEFINE_FALSE_AND_TRUE 0
#endif
// The macros are also visible when U_IN_DOXYGEN is defined.
#if U_DEFINE_FALSE_AND_TRUE || defined(U_IN_DOXYGEN)
#ifndef TRUE
/**
* The TRUE value of a UBool.
*
* @deprecated ICU 68 Use standard "true" instead.
*/
# define TRUE 1
#endif
#ifndef FALSE
/**
* The FALSE value of a UBool.
*
* @deprecated ICU 68 Use standard "false" instead.
*/
# define FALSE 0
#endif
#endif // U_DEFINE_FALSE_AND_TRUE || defined(U_IN_DOXYGEN)
/*==========================================================================*/
/* Unicode data types */
/*==========================================================================*/
/* wchar_t-related definitions -------------------------------------------- */
/*
* \def U_WCHAR_IS_UTF16
* Defined if wchar_t uses UTF-16.
*
* @stable ICU 2.0
*/
/*
* \def U_WCHAR_IS_UTF32
* Defined if wchar_t uses UTF-32.
*
* @stable ICU 2.0
*/
// Auto-detection: only runs when neither macro was defined beforehand.
// At most one of the two macros is defined by the chain below; if no branch
// applies, both stay undefined.
#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
# ifdef __STDC_ISO_10646__
// Decide by the width of wchar_t (2 bytes: UTF-16, 4 bytes: UTF-32).
# if (U_SIZEOF_WCHAR_T==2)
# define U_WCHAR_IS_UTF16
# elif (U_SIZEOF_WCHAR_T==4)
# define U_WCHAR_IS_UTF32
# endif
# elif defined __UCS2__
// Only for platforms in the range U_PF_OS390..U_PF_OS400 with a 2-byte wchar_t.
# if (U_PF_OS390 <= U_PLATFORM && U_PLATFORM <= U_PF_OS400) && (U_SIZEOF_WCHAR_T==2)
# define U_WCHAR_IS_UTF16
# endif
# elif defined(__UCS4__) || (U_PLATFORM == U_PF_OS400 && defined(__UTF32__))
# if (U_SIZEOF_WCHAR_T==4)
# define U_WCHAR_IS_UTF32
# endif
# elif U_PLATFORM_IS_DARWIN_BASED || (U_SIZEOF_WCHAR_T==4 && U_PLATFORM_IS_LINUX_BASED)
# define U_WCHAR_IS_UTF32
# elif U_PLATFORM_HAS_WIN32_API
# define U_WCHAR_IS_UTF16
# endif
#endif
/* UChar and UChar32 definitions -------------------------------------------- */
/** Number of bytes in a UChar (always 2). @stable ICU 2.0 */
#define U_SIZEOF_UCHAR 2
/**
* \def U_CHAR16_IS_TYPEDEF
* If 1, then char16_t is a typedef and not a real type (yet).
* Set to 1 only for MSVC with _MSC_VER < 1900, and to 0 otherwise.
* @internal
*/
#if defined(_MSC_VER) && (_MSC_VER < 1900)
// Versions of Visual Studio/MSVC below 2015 do not support char16_t as a real type,
// and instead use a typedef. https://msdn.microsoft.com/library/bb531344.aspx
# define U_CHAR16_IS_TYPEDEF 1
#else
# define U_CHAR16_IS_TYPEDEF 0
#endif
/**
* \var UChar
*
* The base type for UTF-16 code units and pointers.
* Unsigned 16-bit integer.
* Starting with ICU 59, C++ API uses char16_t directly, while C API continues to use UChar.
*
* UChar is configurable by defining the macro UCHAR_TYPE
* on the preprocessor or compiler command line:
* -DUCHAR_TYPE=uint16_t or -DUCHAR_TYPE=wchar_t (if U_SIZEOF_WCHAR_T==2) etc.
* (The UCHAR_TYPE can also be \#defined earlier in this file, for outside the ICU library code.)
* This is for transitional use from application code that uses uint16_t or wchar_t for UTF-16.
* UCHAR_TYPE is ignored when U_ALL_IMPLEMENTATION is defined; UChar is then always char16_t.
*
* The default is UChar=char16_t.
*
* C++11 defines char16_t as bit-compatible with uint16_t, but as a distinct type.
*
* In C, char16_t is a simple typedef of uint_least16_t.
* ICU requires uint_least16_t=uint16_t for data memory mapping.
* On macOS, char16_t is not available because the uchar.h standard header is missing.
*
* @stable ICU 4.4
*/
#if 1
// #if 1 is normal. UChar defaults to char16_t in C++.
// For configuration testing of UChar=uint16_t temporarily change this to #if 0.
#else
# define UCHAR_TYPE uint16_t
#endif
// Use char16_t unless a UCHAR_TYPE override is in effect (see above).
#if defined(U_ALL_IMPLEMENTATION) || !defined(UCHAR_TYPE)
typedef char16_t UChar;
#else
typedef UCHAR_TYPE UChar;
#endif
/**
* \var OldUChar
* Default ICU 58 definition of UChar.
* A base type for UTF-16 code units and pointers.
* Unsigned 16-bit integer.
*
* Define OldUChar to be wchar_t if that is 16 bits wide.
* If wchar_t is not 16 bits wide, then define OldUChar to be __CHAR16_TYPE__
* if the compiler defines that macro, or else uint16_t.
*
* This makes the definition of OldUChar platform-dependent
* but allows direct string type compatibility with platforms with
* 16-bit wchar_t types.
*
* This is how UChar was defined in ICU 58, for transition convenience.
* Exception: ICU 58 UChar was defined to UCHAR_TYPE if that macro was defined.
* The current UChar responds to UCHAR_TYPE but OldUChar does not.
*
* @stable ICU 59
*/
#if U_SIZEOF_WCHAR_T==2
typedef wchar_t OldUChar;
#elif defined(__CHAR16_TYPE__)
typedef __CHAR16_TYPE__ OldUChar;
#else
typedef uint16_t OldUChar;
#endif
/**
* Define UChar32 as a type for single Unicode code points.
* UChar32 is a signed 32-bit integer (same as int32_t).
*
* The Unicode code point range is 0..0x10ffff.
* All other values (negative or >=0x110000) are illegal as Unicode code points.
* They may be used as sentinel values to indicate "done", "error"
* or similar non-code point conditions.
*
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
* to be wchar_t if that was 32 bits wide (wchar_t may be signed or unsigned)
* or else to be uint32_t.
* That is, the definition of UChar32 used to be platform-dependent.
*
* @see U_SENTINEL
* @stable ICU 2.4
*/
typedef int32_t UChar32;
/**
* This value is intended for sentinel values for APIs that
* (take or) return single code points (UChar32).
* It is negative and therefore outside of the Unicode code point range 0..0x10ffff.
*
* For example, a "done" or "error" value in a new API
* could be indicated with U_SENTINEL.
*
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
* values, mostly 0xffff.
* Those may need to be distinguished from
* actual U+ffff text contents by calling functions like
* CharacterIterator::hasNext() or UnicodeString::length().
*
* @return -1
* @see UChar32
* @stable ICU 2.4
*/
#define U_SENTINEL (-1)
#include "unicode/urename.h"
#endif

62
thirdparty/icu4c/common/unicode/umisc.h vendored Normal file
View File

@@ -0,0 +1,62 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2006, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: umisc.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999oct15
* created by: Markus W. Scherer
*/
#ifndef UMISC_H
#define UMISC_H
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Miscellaneous definitions
*
* This file contains miscellaneous definitions for the C APIs.
*/
U_CDECL_BEGIN
/**
* A struct representing a range of text containing a specific field.
* All three members are plain int32_t values.
* @stable ICU 2.0
*/
typedef struct UFieldPosition {
/**
* The field
* @stable ICU 2.0
*/
int32_t field;
/**
* The start of the text range containing field
* @stable ICU 2.0
*/
int32_t beginIndex;
/**
* The limit of the text range containing field
* @stable ICU 2.0
*/
int32_t endIndex;
} UFieldPosition;
#if !UCONFIG_NO_SERVICE
/**
* Opaque type returned by registerInstance, registerFactory and unregister for service registration.
* Only available when UCONFIG_NO_SERVICE is 0.
* @stable ICU 2.6
*/
typedef const void* URegistryKey;
#endif
U_CDECL_END
#endif

View File

@@ -0,0 +1,240 @@
// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
// umutablecptrie.h (split out of ucptrie.h)
// created: 2018jan24 Markus W. Scherer
#ifndef __UMUTABLECPTRIE_H__
#define __UMUTABLECPTRIE_H__
#include "unicode/utypes.h"
#include "unicode/ucpmap.h"
#include "unicode/ucptrie.h"
#include "unicode/utf8.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
U_CDECL_BEGIN
/**
* \file
* \brief C API: This file defines a mutable Unicode code point trie.
*
* @see UCPTrie
* @see UMutableCPTrie
*/
/**
* Mutable Unicode code point trie (opaque type).
* Fast map from Unicode code points (U+0000..U+10FFFF) to 32-bit integer values.
* For details see https://icu.unicode.org/design/struct/utrie
*
* Setting values (especially ranges) and lookup is fast.
* The mutable trie is only somewhat space-efficient.
* It builds a compacted, immutable UCPTrie.
*
* This trie can be modified while iterating over its contents.
* For example, it is possible to merge its values with those from another
* set of ranges (e.g., another mutable or immutable trie):
* Iterate over those source ranges; for each of them iterate over this trie;
* add the source value into the value of each trie range.
*
* @see UCPTrie
* @see umutablecptrie_buildImmutable
* @stable ICU 63
*/
typedef struct UMutableCPTrie UMutableCPTrie;
/**
* Creates a mutable trie that initially maps each Unicode code point to the same value.
* It uses 32-bit data values until umutablecptrie_buildImmutable() is called.
* umutablecptrie_buildImmutable() takes a valueWidth parameter which
* determines the number of bits in the data value in the resulting UCPTrie.
* You must umutablecptrie_close() the trie once you are done using it.
*
* @param initialValue the initial value that is set for all code points
* @param errorValue the value for out-of-range code points and ill-formed UTF-8/16
* @param pErrorCode an in/out ICU UErrorCode
* @return the new mutable trie, to be closed with umutablecptrie_close()
* @stable ICU 63
*/
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode);
/**
* Clones a mutable trie.
* You must umutablecptrie_close() the clone once you are done using it.
*
* @param other the trie to clone
* @param pErrorCode an in/out ICU UErrorCode
* @return the trie clone, to be closed with umutablecptrie_close()
* @stable ICU 63
*/
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode);
/**
* Closes a mutable trie and releases associated memory.
*
* @param trie the trie to close
* @stable ICU 63
*/
U_CAPI void U_EXPORT2
umutablecptrie_close(UMutableCPTrie *trie);
/**
* Creates a mutable trie with the same contents as the UCPMap.
* You must umutablecptrie_close() the mutable trie once you are done using it.
*
* @param map the source map
* @param pErrorCode an in/out ICU UErrorCode
* @return the new mutable trie, to be closed with umutablecptrie_close()
* @stable ICU 63
*/
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode);
/**
* Creates a mutable trie with the same contents as the immutable one.
* You must umutablecptrie_close() the mutable trie once you are done using it.
*
* @param trie the immutable trie
* @param pErrorCode an in/out ICU UErrorCode
* @return the new mutable trie, to be closed with umutablecptrie_close()
* @stable ICU 63
*/
U_CAPI UMutableCPTrie * U_EXPORT2
umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode);
/**
* Returns the value for a code point as stored in the trie.
*
* @param trie the trie
* @param c the code point
* @return the value stored for c
* @stable ICU 63
*/
U_CAPI uint32_t U_EXPORT2
umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c);
/**
* Returns the last code point such that all those from start to there have the same value.
* Can be used to efficiently iterate over all same-value ranges in a trie.
* (This is normally faster than iterating over code points and get()ting each value,
* but much slower than a data structure that stores ranges directly.)
*
* The trie can be modified between calls to this function.
*
* If the UCPMapValueFilter function pointer is not NULL, then
* the value to be delivered is passed through that function, and the return value is the end
* of the range where all values are modified to the same actual value.
* The value is unchanged if that function pointer is NULL.
*
* See the same-signature ucptrie_getRange() for a code sample.
*
* @param trie the trie
* @param start the first code point of the range
* @param option defines whether surrogates are treated normally,
* or as having the surrogateValue; usually UCPMAP_RANGE_NORMAL
* @param surrogateValue value for surrogates; ignored if option==UCPMAP_RANGE_NORMAL
* @param filter a pointer to a function that may modify the trie data value,
* or NULL if the values from the trie are to be used unmodified
* @param context an opaque pointer that is passed on to the filter function
* @param pValue if not NULL, receives the value that every code point start..end has;
* may have been modified by filter(context, trie value)
* if that function pointer is not NULL
* @return the range end code point, or -1 if start is not a valid code point
* @stable ICU 63
*/
U_CAPI UChar32 U_EXPORT2
umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
UCPMapRangeOption option, uint32_t surrogateValue,
UCPMapValueFilter *filter, const void *context, uint32_t *pValue);
/**
* Sets a value for a code point.
*
* @param trie the trie
* @param c the code point
* @param value the value to store for c
* @param pErrorCode an in/out ICU UErrorCode
* @stable ICU 63
*/
U_CAPI void U_EXPORT2
umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode);
/**
* Sets a value for each code point [start..end].
* Faster and more space-efficient than setting the value for each code point separately.
*
* @param trie the trie
* @param start the first code point to receive the value
* @param end the last code point to receive the value (inclusive)
* @param value the value
* @param pErrorCode an in/out ICU UErrorCode
* @stable ICU 63
*/
U_CAPI void U_EXPORT2
umutablecptrie_setRange(UMutableCPTrie *trie,
UChar32 start, UChar32 end,
uint32_t value, UErrorCode *pErrorCode);
/**
* Compacts the data and builds an immutable UCPTrie according to the parameters.
* After this, the mutable trie will be empty.
*
* The mutable trie stores 32-bit values until buildImmutable() is called.
* If values shorter than 32 bits are to be stored in the immutable trie,
* then the upper bits are discarded.
* For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581,
* and the value width is 8 bits, then each of these is stored as 0x81
* and the immutable trie will return that as an unsigned value.
* (Some implementations may want to make productive temporary use of the upper bits
* until buildImmutable() discards them.)
*
* Not every possible set of mappings can be built into a UCPTrie,
* because of limitations resulting from speed and space optimizations.
* Every Unicode assigned character can be mapped to a unique value.
* Typical data yields data structures far smaller than the limitations.
*
* It is possible to construct extremely unusual mappings that exceed the data structure limits.
* In such a case this function will fail with a U_INDEX_OUTOFBOUNDS_ERROR.
*
* @param trie the mutable trie
* @param type selects the trie type
* @param valueWidth selects the number of bits in a trie data value; if smaller than 32 bits,
* then the values stored in the trie will be truncated first
* @param pErrorCode an in/out ICU UErrorCode
* @return the immutable UCPTrie built from the mutable trie
*
* @see umutablecptrie_fromUCPTrie
* @stable ICU 63
*/
U_CAPI UCPTrie * U_EXPORT2
umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth,
UErrorCode *pErrorCode);
U_CDECL_END
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUMutableCPTriePointer
* "Smart pointer" class, closes a UMutableCPTrie via umutablecptrie_close().
* For most methods see the LocalPointerBase base class.
* Only declared when U_SHOW_CPLUSPLUS_API is enabled.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 63
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUMutableCPTriePointer, UMutableCPTrie, umutablecptrie_close);
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,136 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1999-2010, International Business Machines Corporation and others.
* All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/17/99 aliu Creation.
**********************************************************************
*/
#ifndef UNIFILT_H
#define UNIFILT_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/unifunct.h"
#include "unicode/unimatch.h"
/**
* \file
* \brief C++ API: Unicode Filter
*/
U_NAMESPACE_BEGIN
/**
* U_ETHER is used to represent character values for positions outside
* a range. For example, transliterator uses this to represent
* characters outside the range contextStart..contextLimit-1. This
* allows explicit matching by rules and UnicodeSets of text outside a
* defined range.
* Its value is the char16_t code unit 0xFFFF.
* @stable ICU 3.0
*/
#define U_ETHER ((char16_t)0xFFFF)
/**
*
* <code>UnicodeFilter</code> defines a protocol for selecting a
* subset of the full range (U+0000 to U+10FFFF) of Unicode characters.
* Currently, filters are used in conjunction with classes like
* {@link Transliterator} to only process selected characters through a
* transformation.
*
* <p>Note: UnicodeFilter currently stubs out two pure virtual methods
* of its base class, UnicodeMatcher. These methods are toPattern()
* and matchesIndexValue(). This is done so that filter classes that
* are not actually used as matchers -- specifically, those in the
* UnicodeFilterLogic component, and those in tests -- can continue to
* work without defining these methods. As long as a filter is not
* used in an RBT during real transliteration, these methods will not
* be called. However, this breaks the UnicodeMatcher base class
* protocol, and it is not a correct solution.
* NOTE(review): no toPattern() or matchesIndexValue() stub is declared in
* this class declaration -- confirm whether this note is still accurate.
*
* <p>In the future we may revisit the UnicodeMatcher / UnicodeFilter
* hierarchy and either redesign it, or simply remove the stubs in
* UnicodeFilter and force subclasses to implement the full
* UnicodeMatcher protocol.
*
* @see UnicodeFilterLogic
* @stable ICU 2.0
*/
class U_COMMON_API UnicodeFilter : public UnicodeFunctor, public UnicodeMatcher {
public:
/**
* Destructor
* @stable ICU 2.0
*/
virtual ~UnicodeFilter();
/**
* Clones this object polymorphically.
* The caller owns the result and should delete it when done.
* Pure virtual: concrete subclasses must implement it.
* @return clone, or nullptr if an error occurred
* @stable ICU 2.4
*/
virtual UnicodeFilter* clone() const override = 0;
/**
* Returns <tt>true</tt> for characters that are in the selected
* subset. In other words, if a character is <b>to be
* filtered</b>, then <tt>contains()</tt> returns
* <b><tt>false</tt></b>.
* Pure virtual: concrete subclasses must implement it.
* @param c the code point to test
* @return true if c is in the selected subset
* @stable ICU 2.0
*/
virtual UBool contains(UChar32 c) const = 0;
/**
* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
* and return the pointer.
* @stable ICU 2.4
*/
virtual UnicodeMatcher* toMatcher() const override;
/**
* Implement UnicodeMatcher API.
* @stable ICU 2.4
*/
virtual UMatchDegree matches(const Replaceable& text,
int32_t& offset,
int32_t limit,
UBool incremental) override;
/**
* UnicodeFunctor API. Nothing to do.
* @stable ICU 2.4
*/
virtual void setData(const TransliterationRuleData*) override;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @stable ICU 2.2
*/
static UClassID U_EXPORT2 getStaticClassID();
protected:
/*
* No constructor is declared (the declaration below is commented out):
* since this class has pure virtual functions,
* a constructor can't be used.
* @stable ICU 2.0
*/
/* UnicodeFilter();*/
};
/*inline UnicodeFilter::UnicodeFilter() {}*/
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,132 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2002-2005, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 01/14/2002 aliu Creation.
**********************************************************************
*/
#ifndef UNIFUNCT_H
#define UNIFUNCT_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
/**
* \file
* \brief C++ API: Unicode Functor
*/
U_NAMESPACE_BEGIN
class UnicodeMatcher;
class UnicodeReplacer;
class TransliterationRuleData;
/**
* <code>UnicodeFunctor</code> is an abstract base class for objects
* that perform match and/or replace operations on Unicode strings.
* @author Alan Liu
* @stable ICU 2.4
*/
class U_COMMON_API UnicodeFunctor : public UObject {
public:
/**
* Destructor
* @stable ICU 2.4
*/
virtual ~UnicodeFunctor();
/**
* Return a copy of this object. All UnicodeFunctor objects
* have to support cloning in order to allow classes using
* UnicodeFunctor to implement cloning.
* Pure virtual: concrete subclasses must implement it.
* @return a copy of this object
* @stable ICU 2.4
*/
virtual UnicodeFunctor* clone() const = 0;
/**
* Cast 'this' to a UnicodeMatcher* pointer and return the
* pointer, or null if this is not a UnicodeMatcher*. Subclasses
* that mix in UnicodeMatcher as a base class must override this.
* This protocol is required because a pointer to a UnicodeFunctor
* cannot be cast to a pointer to a UnicodeMatcher, since
* UnicodeMatcher is a mixin that does not derive from
* UnicodeFunctor.
* @return this object as a UnicodeMatcher*, or null
* @stable ICU 2.4
*/
virtual UnicodeMatcher* toMatcher() const;
/**
* Cast 'this' to a UnicodeReplacer* pointer and return the
* pointer, or null if this is not a UnicodeReplacer*. Subclasses
* that mix in UnicodeReplacer as a base class must override this.
* This protocol is required because a pointer to a UnicodeFunctor
* cannot be cast to a pointer to a UnicodeReplacer, since
* UnicodeReplacer is a mixin that does not derive from
* UnicodeFunctor.
* @return this object as a UnicodeReplacer*, or null
* @stable ICU 2.4
*/
virtual UnicodeReplacer* toReplacer() const;
/**
* Return the class ID for this class. This is useful only for
* comparing to a return value from getDynamicClassID().
* @return The class ID for all objects of this class.
* @stable ICU 2.0
*/
static UClassID U_EXPORT2 getStaticClassID();
/**
* Returns a unique class ID <b>polymorphically</b>. This method
* is to implement a simple version of RTTI, since not all C++
* compilers support genuine RTTI. Polymorphic operator==() and
* clone() methods call this method.
*
* <p>Concrete subclasses of UnicodeFunctor should use the macro
* UOBJECT_DEFINE_RTTI_IMPLEMENTATION from uobject.h to
* provide definitions of getStaticClassID and getDynamicClassID.
*
* @return The class ID for this object. All objects of a given
* class have the same class ID. Objects of other classes have
* different class IDs.
* @stable ICU 2.4
*/
virtual UClassID getDynamicClassID() const override = 0;
/**
* Set the data object associated with this functor. The data
* object provides context for functor-to-standin mapping. This
* method is required when assigning a functor to a different data
* object. This function MAY GO AWAY later if the architecture is
* changed to pass data object pointers through the API.
* Pure virtual: concrete subclasses must implement it.
* @internal ICU 2.1
*/
virtual void setData(const TransliterationRuleData*) = 0;
protected:
/*
* No constructor is declared (the declaration below is commented out):
* since this class has pure virtual functions,
* a constructor can't be used.
* @stable ICU 2.0
*/
/*UnicodeFunctor();*/
};
/*inline UnicodeFunctor::UnicodeFunctor() {}*/
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

View File

@@ -0,0 +1,168 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
* Copyright (C) 2001-2005, International Business Machines Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/18/01 aliu Creation.
**********************************************************************
*/
#ifndef UNIMATCH_H
#define UNIMATCH_H
#include "unicode/utypes.h"
/**
* \file
* \brief C++ API: Unicode Matcher
*/
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
class Replaceable;
class UnicodeString;
class UnicodeSet;
/**
 * Degree of match reported by <code>UnicodeMatcher::matches()</code>.
 * @stable ICU 2.4
 */
enum UMatchDegree {
    /**
     * <code>matches()</code> result: the text does not match this
     * matcher. Either the text contains a character which does not
     * match, or (for a non-incremental match) the text does not
     * contain all of the desired characters.
     * @stable ICU 2.4
     */
    U_MISMATCH = 0,

    /**
     * <code>matches()</code> result: the text matches this matcher
     * only partially. Returned for incremental match operations only.
     * Every character of the text matches, but a complete match needs
     * more characters. For variable-length matchers it can also mean
     * that every character of the text matches and that further
     * characters supplied at limit might match as well.
     * @stable ICU 2.4
     */
    U_PARTIAL_MATCH = 1,

    /**
     * <code>matches()</code> result: the text matches this matcher
     * completely. For an incremental variable-length match this is
     * returned when the given text matches and it is known that
     * additional characters would not alter the extent of the match.
     * @stable ICU 2.4
     */
    U_MATCH = 2
};
/**
* <code>UnicodeMatcher</code> defines a protocol for objects that can
* match a range of characters in a Replaceable string.
* All operations other than the destructor are pure virtual.
* @stable ICU 2.4
*/
class U_COMMON_API UnicodeMatcher /* not : public UObject because this is an interface/mixin class */ {
public:
/**
* Destructor.
* @stable ICU 2.4
*/
virtual ~UnicodeMatcher();
/**
* Return a UMatchDegree value indicating the degree of match for
* the given text at the given offset. Zero, one, or more
* characters may be matched.
*
* Matching in the forward direction is indicated by limit >
* offset. Characters from offset forwards to limit-1 will be
* considered for matching.
*
* Matching in the reverse direction is indicated by limit <
* offset. Characters from offset backwards to limit+1 will be
* considered for matching.
*
* If limit == offset then the only match possible is a zero
* character match (which subclasses may implement if desired).
*
* As a side effect, advance the offset parameter to the limit of
* the matched substring. In the forward direction, this will be
* the index of the last matched character plus one. In the
* reverse direction, this will be the index of the last matched
* character minus one.
*
* <p>Note: This method is not const because some classes may
* modify their state as the result of a match.
*
* @param text the text to be matched
* @param offset on input, the index into text at which to begin
* matching. On output, the limit of the matched text. The
* number of matched characters is the output value of offset
* minus the input value. Offset should always point to the
* HIGH SURROGATE (leading code unit) of a pair of surrogates,
* both on entry and upon return.
* @param limit the limit index of text to be matched. Greater
* than offset for a forward direction match, less than offset for
* a backward direction match. The last character to be
* considered for matching will be text.charAt(limit-1) in the
* forward direction or text.charAt(limit+1) in the backward
* direction.
* @param incremental if true, then assume further characters may
* be inserted at limit and check for partial matching. Otherwise
* assume the text as given is complete.
* @return a match degree value indicating a full match, a partial
* match, or a mismatch. If incremental is false then
* U_PARTIAL_MATCH should never be returned.
* @stable ICU 2.4
*/
virtual UMatchDegree matches(const Replaceable& text,
int32_t& offset,
int32_t limit,
UBool incremental) = 0;
/**
* Returns a string representation of this matcher. If the result of
* calling this function is passed to the appropriate parser, it
* will produce another matcher that is equal to this one.
* @param result the string to receive the pattern. Previous
* contents will be deleted.
* @param escapeUnprintable if true then convert unprintable
* characters to their hex escape representations, \\uxxxx or
* \\Uxxxxxxxx. Unprintable characters are those other than
* U+000A, U+0020..U+007E. Defaults to false.
* @return a reference to a UnicodeString holding the pattern
* (NOTE(review): presumably the result argument itself - confirm
* against the implementations)
* @stable ICU 2.4
*/
virtual UnicodeString& toPattern(UnicodeString& result,
UBool escapeUnprintable = false) const = 0;
/**
* Returns true if this matcher will match a character c, where
* (c & 0xFF) == v, at offset, in the forward direction (with limit >
* offset). This is used by <tt>RuleBasedTransliterator</tt> for
* indexing.
* @param v the index value, i.e. the low byte (c & 0xFF) of a
* candidate character c
* @return true if this matcher will match a character c with
* (c & 0xFF) == v
* @stable ICU 2.4
*/
virtual UBool matchesIndexValue(uint8_t v) const = 0;
/**
* Union the set of all characters that may be matched by this object
* into the given set.
* @param toUnionTo the set into which to union the source characters
* @stable ICU 2.4
*/
virtual void addMatchSetTo(UnicodeSet& toUnionTo) const = 0;
};
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

1908
thirdparty/icu4c/common/unicode/uniset.h vendored Normal file

File diff suppressed because it is too large Load Diff

5095
thirdparty/icu4c/common/unicode/unistr.h vendored Normal file

File diff suppressed because it is too large Load Diff

476
thirdparty/icu4c/common/unicode/unorm.h vendored Normal file
View File

@@ -0,0 +1,476 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (c) 1996-2016, International Business Machines Corporation
* and others. All Rights Reserved.
*******************************************************************************
* File unorm.h
*
* Created by: Vladimir Weinstein 12052000
*
* Modification history :
*
* Date Name Description
* 02/01/01 synwee Added normalization quickcheck enum and method.
*/
#ifndef UNORM_H
#define UNORM_H
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uiter.h"
#include "unicode/unorm2.h"
/**
* \file
* \brief C API: Unicode Normalization
*
* Old Unicode normalization API.
*
* This API has been replaced by the unorm2.h API and is only available
* for backward compatibility. The functions here simply delegate to the
* unorm2.h functions, for example unorm2_getInstance() and unorm2_normalize().
* There is one exception: The new API does not provide a replacement for unorm_compare().
* Its declaration has been moved to unorm2.h.
*
* <code>unorm_normalize</code> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <code>unorm_normalize</code> supports the standard normalization forms described in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Standard Annex #15: Unicode Normalization Forms</a>.
*
* Characters with accents or other adornments can be encoded in
* several different ways in Unicode. For example, take the character A-acute.
* In Unicode, this can be encoded as a single character (the
* "composed" form):
*
* \code
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
* \endcode
*
* or as two separate characters (the "decomposed" form):
*
* \code
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT
* \endcode
*
* To a user of your program, however, both of these sequences should be
* treated as the same "user-level" character "A with acute accent". When you are searching or
* comparing text, you must ensure that these two sequences are treated
* equivalently. In addition, you must handle characters with more than one
* accent. Sometimes the order of a character's combining accents is
* significant, while in other cases accent sequences in different orders are
* really equivalent.
*
* Similarly, the string "ffi" can be encoded as three separate letters:
*
* \code
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I
* \endcode
*
* or as the single character
*
* \code
* FB03 LATIN SMALL LIGATURE FFI
* \endcode
*
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility
* with existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions
* into the corresponding semantic characters. When sorting and searching, you
* will often want to use these mappings.
*
* <code>unorm_normalize</code> helps solve these problems by transforming text into the
* canonical composed and decomposed forms as shown in the first example above.
* In addition, you can have it perform compatibility decompositions so that
* you can treat compatibility characters the same as their equivalents.
* Finally, <code>unorm_normalize</code> rearranges accents into the proper canonical
* order, so that you do not have to worry about accent rearrangement on your
* own.
*
* Form FCD, "Fast C or D", is also designed for collation.
* It allows working with strings that are not necessarily normalized
* with an algorithm (like in collation) that works under "canonical closure", i.e., it treats precomposed
* characters and their decomposed equivalents the same.
*
* It is not a normalization form because it does not provide for uniqueness of representation. Multiple strings
* may be canonically equivalent (their NFDs are identical) and may all conform to FCD without being identical
* themselves.
*
* The form is defined such that the "raw decomposition", the recursive canonical decomposition of each character,
* results in a string that is canonically ordered. This means that precomposed characters are allowed for as long
* as their decompositions do not need canonical reordering.
*
* Its advantage for a process like collation is that all NFD and most NFC texts - and many unnormalized texts -
* already conform to FCD and do not need to be normalized (NFD) for such a process. The FCD quick check will
* return UNORM_YES for most strings in practice.
*
* unorm_normalize(UNORM_FCD) may be implemented with UNORM_NFD.
*
* For more details on FCD see the collation design document:
* https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/collation/ICU_collation_design.htm
*
* ICU collation performs either NFD or FCD normalization automatically if normalization
* is turned on for the collator object.
* Beyond collation and string search, normalized strings may be useful for string equivalence comparisons,
* transliteration/transcription, unique representations, etc.
*
* The W3C generally recommends exchanging text in NFC.
* Note also that most legacy character encodings use only precomposed forms and often do not
* encode any combining marks by themselves. For conversion to such character encodings the
* Unicode text needs to be normalized to NFC.
* For more usage examples, see the Unicode Standard Annex.
*/
// Do not conditionalize the following enum with #ifndef U_HIDE_DEPRECATED_API,
// it is needed for layout of Normalizer object.
#ifndef U_FORCE_HIDE_DEPRECATED_API
/**
 * Normalization mode constants of the old normalization API.
 * @deprecated ICU 56 Use unorm2.h instead.
 */
typedef enum {
    /**
     * No decomposition/composition.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_NONE = 1,

    /**
     * Canonical decomposition.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_NFD = 2,

    /**
     * Compatibility decomposition.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_NFKD = 3,

    /**
     * Canonical decomposition followed by canonical composition.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_NFC = 4,

    /**
     * Default normalization; an alias for UNORM_NFC.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_DEFAULT = UNORM_NFC,

    /**
     * Compatibility decomposition followed by canonical composition.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_NFKC = 5,

    /**
     * "Fast C or D" form.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_FCD = 6,

    /**
     * One more than the highest normalization mode constant.
     * Deliberately left without an explicit value so that it follows
     * the last mode listed above.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_MODE_COUNT
} UNormalizationMode;
#endif // U_FORCE_HIDE_DEPRECATED_API
#ifndef U_HIDE_DEPRECATED_API
/**
 * Option flag constants for normalization.
 * Pass 0 to get the default options, which includes normalizing
 * according to the Unicode version that ICU currently supports
 * (see u_getUnicodeVersion).
 * @deprecated ICU 56 Use unorm2.h instead.
 */
enum {
    /**
     * Options bit set value that selects Unicode 3.2 normalization
     * (except NormalizationCorrections).
     * No more than one Unicode version can be selected at a time.
     * @deprecated ICU 56 Use unorm2.h instead.
     */
    UNORM_UNICODE_3_2 = 0x20
};
/**
* Lowest-order bit number of unorm_compare() options bits corresponding to
* normalization options bits.
*
* The options parameter for unorm_compare() uses most bits for
* itself and for various comparison and folding flags.
* The most significant bits, however, are shifted down and passed on
* to the normalization implementation.
* (That is, from unorm_compare(..., options, ...),
* options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
* internal normalization functions.)
*
* With the value 20 defined here, bits 20 and above of the
* unorm_compare() options are the ones passed on as normalization options.
*
* @see unorm_compare
* @deprecated ICU 56 Use unorm2.h instead.
*/
#define UNORM_COMPARE_NORM_OPTIONS_SHIFT 20
/**
* Normalize a string.
* The string will be normalized according to the specified normalization mode
* and options.
* The source and result buffers must not be the same, nor overlap.
*
* @param source The string to normalize.
* @param sourceLength The length of source, or -1 if NUL-terminated.
* @param mode The normalization mode; one of UNORM_NONE,
* UNORM_NFD, UNORM_NFC, UNORM_NFKC, UNORM_NFKD, UNORM_FCD, UNORM_DEFAULT.
* @param options The normalization options, ORed together (0 for no options).
* @param result A pointer to a buffer to receive the result string.
* The result string is NUL-terminated if possible.
* @param resultLength The maximum size of result.
* @param status A pointer to a UErrorCode to receive any errors.
* @return The total buffer size needed; if greater than resultLength,
* the output was truncated, and the error code is set to U_BUFFER_OVERFLOW_ERROR.
* @deprecated ICU 56 Use unorm2.h instead.
*/
U_DEPRECATED int32_t U_EXPORT2
unorm_normalize(const UChar *source, int32_t sourceLength,
UNormalizationMode mode, int32_t options,
UChar *result, int32_t resultLength,
UErrorCode *status);
/**
* Performs a quick check on a string, to quickly determine whether the
* string is in a particular normalization format.
* One of three results is returned: UNORM_YES, UNORM_NO or
* UNORM_MAYBE. UNORM_YES indicates that the argument
* string is in the desired normalized format; UNORM_NO indicates that the
* argument string is not in the desired normalized format. A
* UNORM_MAYBE result indicates that a more thorough check is required;
* the user may have to put the string in its normalized form and compare the
* results.
*
* @param source string for determining if it is in a normalized format
* @param sourcelength length of source to test, or -1 if NUL-terminated
* @param mode which normalization form to test for
* @param status a pointer to a UErrorCode to receive any errors
* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
*
* @see unorm_isNormalized
* @deprecated ICU 56 Use unorm2.h instead.
*/
U_DEPRECATED UNormalizationCheckResult U_EXPORT2
unorm_quickCheck(const UChar *source, int32_t sourcelength,
UNormalizationMode mode,
UErrorCode *status);
/**
* Performs a quick check on a string; same as unorm_quickCheck but
* takes an extra options parameter like most normalization functions.
*
* @param src String that is to be tested if it is in a normalization format.
* @param srcLength Length of source to test, or -1 if NUL-terminated.
* @param mode Which normalization form to test for.
* @param options The normalization options, ORed together (0 for no options).
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return UNORM_YES, UNORM_NO or UNORM_MAYBE
*
* @see unorm_quickCheck
* @see unorm_isNormalized
* @deprecated ICU 56 Use unorm2.h instead.
*/
U_DEPRECATED UNormalizationCheckResult U_EXPORT2
unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
UNormalizationMode mode, int32_t options,
UErrorCode *pErrorCode);
/**
* Test if a string is in a given normalization form.
* This is semantically equivalent to src.equals(normalize(src, mode)).
*
* Unlike unorm_quickCheck(), this function returns a definitive result,
* never a "maybe".
* For NFD, NFKD, and FCD, both functions work exactly the same.
* For NFC and NFKC where quickCheck may return "maybe", this function will
* perform further tests to arrive at a true/false result.
*
* @param src String that is to be tested if it is in a normalization format.
* @param srcLength Length of source to test, or -1 if NUL-terminated.
* @param mode Which normalization form to test for.
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Boolean value indicating whether the source string is in the
* "mode" normalization form.
*
* @see unorm_quickCheck
* @deprecated ICU 56 Use unorm2.h instead.
*/
U_DEPRECATED UBool U_EXPORT2
unorm_isNormalized(const UChar *src, int32_t srcLength,
UNormalizationMode mode,
UErrorCode *pErrorCode);
/**
* Test if a string is in a given normalization form; same as unorm_isNormalized but
* takes an extra options parameter like most normalization functions.
*
* @param src String that is to be tested if it is in a normalization format.
* @param srcLength Length of source to test, or -1 if NUL-terminated.
* @param mode Which normalization form to test for.
* @param options The normalization options, ORed together (0 for no options).
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Boolean value indicating whether the source string is in the
* "mode/options" normalization form.
*
* @see unorm_quickCheck
* @see unorm_quickCheckWithOptions
* @see unorm_isNormalized
* @deprecated ICU 56 Use unorm2.h instead.
*/
U_DEPRECATED UBool U_EXPORT2
unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
UNormalizationMode mode, int32_t options,
UErrorCode *pErrorCode);
/**
* Iterative normalization forward.
* This function (together with unorm_previous) is somewhat
* similar to the C++ Normalizer class (see its non-static functions).
*
* Iterative normalization is useful when only a small portion of a longer
* string/text needs to be processed.
*
* For example, the likelihood may be high that processing the first 10% of some
* text will be sufficient to find certain data.
* Another example: When one wants to concatenate two normalized strings and get a
* normalized result, it is much more efficient to normalize just a small part of
* the result around the concatenation place instead of re-normalizing everything.
*
* The input text is an instance of the C character iteration API UCharIterator.
* It may wrap around a simple string, a CharacterIterator, a Replaceable, or any
* other kind of text object.
*
* If a buffer overflow occurs, then the caller needs to reset the iterator to the
* old index and call the function again with a larger buffer - if the caller cares
* for the actual output.
* Regardless of the output buffer, the iterator will always be moved to the next
* normalization boundary.
*
* This function (like unorm_previous) serves two purposes:
*
* 1) To find the next boundary so that the normalization of the part of the text
* from the current position to that boundary does not affect and is not affected
* by the part of the text beyond that boundary.
*
* 2) To normalize the text up to the boundary.
*
* The second step is optional, per the doNormalize parameter.
* It is omitted for operations like string concatenation, where the two adjacent
* string ends need to be normalized together.
* In such a case, the output buffer will just contain a copy of the text up to the
* boundary.
*
* pNeededToNormalize is an output-only parameter. Its output value is only defined
* if normalization was requested (doNormalize) and successful (especially, no
* buffer overflow).
* It is useful for operations like a normalizing transliterator, where one would
* not want to replace a piece of text if it is not modified.
*
* If doNormalize==true and pNeededToNormalize!=NULL then *pNeededToNormalize is
* set to true if the normalization was necessary.
*
* If doNormalize==false then *pNeededToNormalize will be set to false.
*
* If the buffer overflows, then *pNeededToNormalize will be undefined;
* essentially, whenever U_FAILURE is true (like in buffer overflows), this result
* will be undefined.
*
* @param src The input text in the form of a C character iterator.
* @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
* @param destCapacity The number of UChars that fit into dest.
* @param mode The normalization mode.
* @param options The normalization options, ORed together (0 for no options).
* @param doNormalize Indicates if the source text up to the next boundary
* is to be normalized (true) or just copied (false).
* @param pNeededToNormalize Output flag indicating if the normalization resulted in
* different text from the input.
* Not defined if an error occurs including buffer overflow.
* Always false if !doNormalize.
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Length of output (number of UChars) when successful or buffer overflow.
*
* @see unorm_previous
* @see unorm_normalize
*
* @deprecated ICU 56 Use unorm2.h instead.
*/
U_DEPRECATED int32_t U_EXPORT2
unorm_next(UCharIterator *src,
UChar *dest, int32_t destCapacity,
UNormalizationMode mode, int32_t options,
UBool doNormalize, UBool *pNeededToNormalize,
UErrorCode *pErrorCode);
/**
* Iterative normalization backward.
* This function (together with unorm_next) is somewhat
* similar to the C++ Normalizer class (see its non-static functions).
* For all details see unorm_next; this function takes the same
* parameter list.
*
* @param src The input text in the form of a C character iterator.
* @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
* @param destCapacity The number of UChars that fit into dest.
* @param mode The normalization mode.
* @param options The normalization options, ORed together (0 for no options).
* @param doNormalize Indicates if the source text up to the next boundary
* is to be normalized (true) or just copied (false).
* @param pNeededToNormalize Output flag indicating if the normalization resulted in
* different text from the input.
* Not defined if an error occurs including buffer overflow.
* Always false if !doNormalize.
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Length of output (number of UChars) when successful or buffer overflow.
*
* @see unorm_next
* @see unorm_normalize
*
* @deprecated ICU 56 Use unorm2.h instead.
*/
U_DEPRECATED int32_t U_EXPORT2
unorm_previous(UCharIterator *src,
UChar *dest, int32_t destCapacity,
UNormalizationMode mode, int32_t options,
UBool doNormalize, UBool *pNeededToNormalize,
UErrorCode *pErrorCode);
/**
* Concatenate normalized strings, making sure that the result is normalized as well.
*
* If both the left and the right strings are in
* the normalization form according to "mode/options",
* then the result will be
*
* \code
* dest=normalize(left+right, mode, options)
* \endcode
*
* With the input strings already being normalized,
* this function will use unorm_next() and unorm_previous()
* to find the adjacent end pieces of the input strings.
* Only the concatenation of these end pieces will be normalized and
* then concatenated with the remaining parts of the input strings.
*
* dest may be the same buffer as left (dest==left), which avoids copying
* the entire left string. right, however, must not be the same as dest,
* nor overlap with it.
*
* @param left Left source string, may be same as dest.
* @param leftLength Length of left source string, or -1 if NUL-terminated.
* @param right Right source string. Must not be the same as dest, nor overlap.
* @param rightLength Length of right source string, or -1 if NUL-terminated.
* @param dest The output buffer; can be NULL if destCapacity==0 for pure preflighting.
* @param destCapacity The number of UChars that fit into dest.
* @param mode The normalization mode.
* @param options The normalization options, ORed together (0 for no options).
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return Length of output (number of UChars) when successful or buffer overflow.
*
* @see unorm_normalize
* @see unorm_next
* @see unorm_previous
*
* @deprecated ICU 56 Use unorm2.h instead.
*/
U_DEPRECATED int32_t U_EXPORT2
unorm_concatenate(const UChar *left, int32_t leftLength,
const UChar *right, int32_t rightLength,
UChar *dest, int32_t destCapacity,
UNormalizationMode mode, int32_t options,
UErrorCode *pErrorCode);
#endif /* U_HIDE_DEPRECATED_API */
#endif /* #if !UCONFIG_NO_NORMALIZATION */
#endif

626
thirdparty/icu4c/common/unicode/unorm2.h vendored Normal file
View File

@@ -0,0 +1,626 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2009-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: unorm2.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009dec15
* created by: Markus W. Scherer
*/
#ifndef __UNORM2_H__
#define __UNORM2_H__
/**
* \file
* \brief C API: New API for Unicode Normalization.
*
* Unicode normalization functionality for standard Unicode normalization or
* for using custom mapping tables.
* All instances of UNormalizer2 are unmodifiable/immutable.
* Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
* For more details see the Normalizer2 C++ class.
*/
#include "unicode/utypes.h"
#include "unicode/stringoptions.h"
#include "unicode/uset.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
 * Normalization mode constants.
 * The standard Unicode normalization forms, and the algorithms which are
 * also used with custom mapping tables, are described at
 * http://www.unicode.org/unicode/reports/tr15/
 * @stable ICU 4.4
 */
typedef enum {
    /**
     * Decomposition followed by composition.
     * With an "nfc" instance this is the same as standard NFC;
     * with an "nfkc" instance it is the same as standard NFKC.
     * For details about standard Unicode normalization forms
     * see http://www.unicode.org/unicode/reports/tr15/
     * @stable ICU 4.4
     */
    UNORM2_COMPOSE = 0,

    /**
     * Map, and reorder canonically.
     * With an "nfc" instance this is the same as standard NFD;
     * with an "nfkc" instance it is the same as standard NFKD.
     * For details about standard Unicode normalization forms
     * see http://www.unicode.org/unicode/reports/tr15/
     * @stable ICU 4.4
     */
    UNORM2_DECOMPOSE = 1,

    /**
     * "Fast C or D" form.
     * For a string in this form, further decomposition
     * <i>without reordering</i> would yield the same form as DECOMPOSE.
     * Text in "Fast C or D" form can be processed efficiently with data
     * tables that are "canonically closed", that is, that provide
     * equivalent data for equivalent text, without having to be fully
     * normalized.
     * Not a standard Unicode normalization form.
     * Not a unique form: Different FCD strings can be canonically equivalent.
     * For details see http://www.unicode.org/notes/tn5/#FCD
     * @stable ICU 4.4
     */
    UNORM2_FCD = 2,

    /**
     * Compose only contiguously.
     * Also known as "FCC" or "Fast C Contiguous".
     * The result will often, but not always, be in NFC.
     * The result will conform to FCD, which is useful for processing.
     * Not a standard Unicode normalization form.
     * For details see http://www.unicode.org/notes/tn5/#FCC
     * @stable ICU 4.4
     */
    UNORM2_COMPOSE_CONTIGUOUS = 3
} UNormalization2Mode;
/**
 * Values returned by the normalization quick check functions.
 * For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
 * @stable ICU 2.0
 */
typedef enum UNormalizationCheckResult {
    /**
     * The input string is not in the normalization form.
     * @stable ICU 2.0
     */
    UNORM_NO = 0,

    /**
     * The input string is in the normalization form.
     * @stable ICU 2.0
     */
    UNORM_YES = 1,

    /**
     * The input string may or may not be in the normalization form.
     * Only composition forms like NFC and FCC return this value, namely
     * when a backward-combining character is found for which the
     * surrounding text would have to be analyzed further.
     * @stable ICU 2.0
     */
    UNORM_MAYBE = 2
} UNormalizationCheckResult;
/**
* Opaque C service object type for the new normalization API.
* The struct is only forward-declared here; the functions declared in this
* header pass instances around by pointer.
* @stable ICU 4.4
*/
struct UNormalizer2;
typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. @stable ICU 4.4 */
#if !UCONFIG_NO_NORMALIZATION
/**
* Returns a UNormalizer2 instance for Unicode NFC normalization.
* Same as unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE, pErrorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2 instance, if successful
* @see unorm2_getInstance
* @stable ICU 49
*/
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFCInstance(UErrorCode *pErrorCode);
/**
* Returns a UNormalizer2 instance for Unicode NFD normalization.
* Same as unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, pErrorCode);
* note that NFD uses the "nfc" data with the decompose mode.
* Returns an unmodifiable singleton instance. Do not delete it.
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2 instance, if successful
* @see unorm2_getInstance
* @stable ICU 49
*/
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFDInstance(UErrorCode *pErrorCode);
/**
* Returns a UNormalizer2 instance for Unicode NFKC normalization.
* Same as unorm2_getInstance(NULL, "nfkc", UNORM2_COMPOSE, pErrorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2 instance, if successful
* @see unorm2_getInstance
* @stable ICU 49
*/
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCInstance(UErrorCode *pErrorCode);
/**
* Returns a UNormalizer2 instance for Unicode NFKD normalization.
* Same as unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, pErrorCode);
* note that NFKD uses the "nfkc" data with the decompose mode.
* Returns an unmodifiable singleton instance. Do not delete it.
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2 instance, if successful
* @see unorm2_getInstance
* @stable ICU 49
*/
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKDInstance(UErrorCode *pErrorCode);
/**
* Returns a UNormalizer2 instance for Unicode toNFKC_Casefold() normalization
* which is equivalent to applying the NFKC_Casefold mappings and then NFC.
* See https://www.unicode.org/reports/tr44/#NFKC_Casefold
*
* Same as unorm2_getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, pErrorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2 instance, if successful
* @see unorm2_getInstance
* @stable ICU 49
*/
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCCasefoldInstance(UErrorCode *pErrorCode);
/**
* Returns a UNormalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
* which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
* See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
*
* Same as unorm2_getInstance(NULL, "nfkc_scf", UNORM2_COMPOSE, pErrorCode).
* Returns an unmodifiable singleton instance. Do not delete it.
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2 instance, if successful
* @see unorm2_getInstance
* @stable ICU 74
*/
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFKCSimpleCasefoldInstance(UErrorCode *pErrorCode);
/**
* Returns a UNormalizer2 instance which uses the specified data file
* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
* and which composes or decomposes text according to the specified mode.
* Returns an unmodifiable singleton instance. Do not delete it.
*
* Use packageName=NULL for data files that are part of ICU's own data.
* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
* Use name="nfkc_scf" and UNORM2_COMPOSE for the NFKC_Simple_Casefold variant
* (see unorm2_getNFKCSimpleCasefoldInstance()).
*
* @param packageName NULL for ICU built-in data, otherwise application data package name
* @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
* @param mode normalization mode (compose or decompose etc.)
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2, if successful
* @stable ICU 4.4
*/
U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getInstance(const char *packageName,
const char *name,
UNormalization2Mode mode,
UErrorCode *pErrorCode);
/**
* Constructs a filtered normalizer wrapping any UNormalizer2 instance
* and a filter set.
* Both are aliased and must not be modified or deleted while this object
* is used.
* The filter set should be frozen; otherwise the performance will suffer greatly.
* Close the returned instance with unorm2_close() when it is no longer needed.
* @param norm2 wrapped UNormalizer2 instance
* @param filterSet USet which determines the characters to be normalized
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2, if successful
* @see unorm2_close
* @stable ICU 4.4
*/
U_CAPI UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode);
/**
* Closes a UNormalizer2 instance from unorm2_openFiltered().
* Do not close instances from unorm2_getInstance() or from the other
* singleton getters such as unorm2_getNFKCSimpleCasefoldInstance():
* those functions return unmodifiable singletons that must not be deleted.
* @param norm2 UNormalizer2 instance to be closed
* @see unorm2_openFiltered
* @stable ICU 4.4
*/
U_CAPI void U_EXPORT2
unorm2_close(UNormalizer2 *norm2);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUNormalizer2Pointer
* "Smart pointer" class, closes a UNormalizer2 via unorm2_close().
* Only wrap instances obtained from unorm2_openFiltered(); the singleton
* instances from unorm2_getInstance() must not be closed.
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUNormalizer2Pointer, UNormalizer2, unorm2_close);
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
/**
* Writes the normalized form of the source string to the destination string
* (replacing its contents) and returns the length of the destination string.
* The source and destination strings must be different buffers.
* @param norm2 UNormalizer2 instance
* @param src source string
* @param length length of the source string, or -1 if NUL-terminated
* @param dest destination string; its contents is replaced with normalized src
* @param capacity number of UChars that can be written to dest
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the length of the destination string (not the dest pointer)
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 *norm2,
const UChar *src, int32_t length,
UChar *dest, int32_t capacity,
UErrorCode *pErrorCode);
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the length of the first string.
* The result is normalized if the first string was normalized.
* The first and second strings must be different buffers.
* @param norm2 UNormalizer2 instance
* @param first string, should be normalized
* @param firstLength length of the first string, or -1 if NUL-terminated
* @param firstCapacity number of UChars that can be written to first
* @param second string, will be normalized
* @param secondLength length of the second string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the length of the first string after appending (not the first pointer)
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
UChar *first, int32_t firstLength, int32_t firstCapacity,
const UChar *second, int32_t secondLength,
UErrorCode *pErrorCode);
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the length of the first string.
* The result is normalized if both the strings were normalized.
* The first and second strings must be different buffers.
* @param norm2 UNormalizer2 instance
* @param first string, should be normalized
* @param firstLength length of the first string, or -1 if NUL-terminated
* @param firstCapacity number of UChars that can be written to first
* @param second string, should be normalized
* @param secondLength length of the second string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the length of the first string after appending (not the first pointer)
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
unorm2_append(const UNormalizer2 *norm2,
UChar *first, int32_t firstLength, int32_t firstCapacity,
const UChar *second, int32_t secondLength,
UErrorCode *pErrorCode);
/**
* Gets the decomposition mapping of c.
* Roughly equivalent to normalizing the String form of c
* on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster, and except that this function
* returns a negative value and does not write a string
* if c does not have a decomposition mapping in this instance's data.
* This function is independent of the mode of the UNormalizer2.
* @param norm2 UNormalizer2 instance
* @param c code point
* @param decomposition String buffer which will be set to c's
* decomposition mapping, if there is one.
* @param capacity number of UChars that can be written to decomposition
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the non-negative length of c's decomposition, if there is one; otherwise a negative value
* @stable ICU 4.6
*/
U_CAPI int32_t U_EXPORT2
unorm2_getDecomposition(const UNormalizer2 *norm2,
UChar32 c, UChar *decomposition, int32_t capacity,
UErrorCode *pErrorCode);
/**
* Gets the raw decomposition mapping of c.
*
* This is similar to the unorm2_getDecomposition() function but returns the
* raw decomposition mapping as specified in UnicodeData.txt or
* (for custom data) in the mapping files processed by the gennorm2 tool.
* By contrast, unorm2_getDecomposition() returns the processed,
* recursively-decomposed version of this mapping.
*
* When used on a standard NFKC Normalizer2 instance,
* unorm2_getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
*
* When used on a standard NFC Normalizer2 instance,
* it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
* in this case, the result contains either one or two code points (=1..4 UChars).
*
* This function is independent of the mode of the UNormalizer2.
* @param norm2 UNormalizer2 instance
* @param c code point
* @param decomposition String buffer which will be set to c's
* raw decomposition mapping, if there is one.
* @param capacity number of UChars that can be written to decomposition
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the non-negative length of c's raw decomposition, if there is one; otherwise a negative value
* @stable ICU 49
*/
U_CAPI int32_t U_EXPORT2
unorm2_getRawDecomposition(const UNormalizer2 *norm2,
UChar32 c, UChar *decomposition, int32_t capacity,
UErrorCode *pErrorCode);
/**
* Performs pairwise composition of a & b and returns the composite if there is one.
*
* Returns a composite code point c only if c has a two-way mapping to a+b.
* In standard Unicode normalization, this means that
* c has a canonical decomposition to a+b
* and c does not have the Full_Composition_Exclusion property.
*
* This function is independent of the mode of the UNormalizer2.
* @param norm2 UNormalizer2 instance
* @param a A (normalization starter) code point.
* @param b Another code point.
* @return The non-negative composite code point if there is one; otherwise a negative value.
* @stable ICU 49
*/
U_CAPI UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b);
/**
* Gets the combining class of c.
* The default implementation returns 0
* but all standard implementations return the Unicode Canonical_Combining_Class value.
* @param norm2 UNormalizer2 instance
* @param c code point
* @return c's combining class
* @stable ICU 49
*/
U_CAPI uint8_t U_EXPORT2
unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c);
/**
* Tests if the string is normalized.
* Internally, in cases where unorm2_quickCheck() would return "maybe"
* (which is only possible for the two COMPOSE modes) this function
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
* @param norm2 UNormalizer2 instance
* @param s input string
* @param length length of the string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return true if s is normalized
* @see unorm2_quickCheck
* @stable ICU 4.4
*/
U_CAPI UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 *norm2,
const UChar *s, int32_t length,
UErrorCode *pErrorCode);
/**
* Tests if the string is normalized.
* For the two COMPOSE modes, the result could be "maybe" in cases that
* would take a little more work to resolve definitively.
* Use unorm2_spanQuickCheckYes() and unorm2_normalizeSecondAndAppend() for a faster
* combination of quick check + normalization, to avoid
* re-checking the "yes" prefix.
* Use unorm2_isNormalized() when a definitive yes/no answer is required.
* @param norm2 UNormalizer2 instance
* @param s input string
* @param length length of the string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return UNormalizationCheckResult
* @stable ICU 4.4
*/
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 *norm2,
const UChar *s, int32_t length,
UErrorCode *pErrorCode);
/**
* Returns the end of the normalized substring of the input string.
* In other words, with
* <code>end=unorm2_spanQuickCheckYes(norm2, s, length, pErrorCode);</code>
* the first <code>end</code> UChars of s
* will pass the quick check with a "yes" result.
*
* The returned end index is usually one or more characters before the
* "no" or "maybe" character: The end index is at a normalization boundary.
* (See the Normalizer2 class documentation for more about normalization boundaries.)
*
* When the goal is a normalized string and most input strings are expected
* to be normalized already, then call this function,
* and if it returns a prefix shorter than the input string,
* copy that prefix and use unorm2_normalizeSecondAndAppend() for the remainder.
* @param norm2 UNormalizer2 instance
* @param s input string
* @param length length of the string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return "yes" span end index
* @see unorm2_normalizeSecondAndAppend
* @stable ICU 4.4
*/
U_CAPI int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
const UChar *s, int32_t length,
UErrorCode *pErrorCode);
/**
* Tests if the character always has a normalization boundary before it,
* regardless of context.
* For details see the Normalizer2 base class documentation.
* @param norm2 UNormalizer2 instance
* @param c character to test
* @return true if c has a normalization boundary before it
* @stable ICU 4.4
*/
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);
/**
* Tests if the character always has a normalization boundary after it,
* regardless of context.
* For details see the Normalizer2 base class documentation.
* @param norm2 UNormalizer2 instance
* @param c character to test
* @return true if c has a normalization boundary after it
* @stable ICU 4.4
*/
U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);
/**
* Tests if the character is normalization-inert.
* For details see the Normalizer2 base class documentation.
* @param norm2 UNormalizer2 instance
* @param c character to test
* @return true if c is normalization-inert
* @stable ICU 4.4
*/
U_CAPI UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 *norm2, UChar32 c);
/**
* Compares two strings for canonical equivalence.
* Further options include case-insensitive comparison and
* code point order (as opposed to code unit order).
*
* Canonical equivalence between two strings is defined as their normalized
* forms (NFD or NFC) being identical.
* This function compares strings incrementally instead of normalizing
* (and optionally case-folding) both strings entirely,
* improving performance significantly.
*
* Bulk normalization is only necessary if the strings do not fulfill the FCD
* conditions. Only in this case, and only if the strings are relatively long,
* is memory allocated temporarily.
* For FCD strings and short non-FCD strings there is no memory allocation.
*
* Semantically, this is equivalent to
* strcmp[CodePointOrder](NFD(foldCase(NFD(s1))), NFD(foldCase(NFD(s2))))
* where code point order and foldCase are all optional.
*
* UAX 21 2.5 Caseless Matching specifies that for a canonical caseless match
* the case folding must be performed first, then the normalization.
*
* @param s1 First source string.
* @param length1 Length of first source string, or -1 if NUL-terminated.
*
* @param s2 Second source string.
* @param length2 Length of second source string, or -1 if NUL-terminated.
*
* @param options A bit set of options:
* - U_FOLD_CASE_DEFAULT or 0 is used for default options:
* Case-sensitive comparison in code unit order, and the input strings
* are quick-checked for FCD.
*
* - UNORM_INPUT_IS_FCD
* Set if the caller knows that both s1 and s2 fulfill the FCD conditions.
* If not set, the function will quickCheck for FCD
* and normalize if necessary.
*
* - U_COMPARE_CODE_POINT_ORDER
* Set to choose code point order instead of code unit order
* (see u_strCompare for details).
*
* - U_COMPARE_IGNORE_CASE
* Set to compare strings case-insensitively using case folding,
* instead of case-sensitively.
* If set, then the following case folding options are used.
*
* - Options as used with case-insensitive comparisons, currently:
*
* - U_FOLD_CASE_EXCLUDE_SPECIAL_I
* (see u_strCaseCompare for details)
*
* - regular normalization options shifted left by UNORM_COMPARE_NORM_OPTIONS_SHIFT
*
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
* @return <0 or 0 or >0 as usual for string comparisons
*
* @see unorm_normalize
* @see UNORM_FCD
* @see u_strCompare
* @see u_strCaseCompare
*
* @stable ICU 2.2
*/
U_CAPI int32_t U_EXPORT2
unorm_compare(const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
uint32_t options,
UErrorCode *pErrorCode);
#endif /* !UCONFIG_NO_NORMALIZATION */
#endif /* __UNORM2_H__ */

View File

@@ -0,0 +1,324 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2002-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uobject.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2002jun26
* created by: Markus W. Scherer
*/
#ifndef __UOBJECT_H__
#define __UOBJECT_H__
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/platform.h"
/**
* \file
* \brief C++ API: Common ICU base class UObject.
*/
/**
* \def U_NO_THROW
* Since ICU 64, use noexcept instead.
*
* Previously, define this to define the throw() specification so
* certain functions do not throw any exceptions
*
* UMemory operator new methods should have the throw() specification
* appended to them, so that the compiler adds the additional nullptr check
* before calling constructors. Without, if <code>operator new</code> returns nullptr the
* constructor is still called, and if the constructor references member
* data, (which it typically does), the result is a segmentation violation.
*
* @stable ICU 4.2. Since ICU 64, Use noexcept instead. See ICU-20422.
*/
#ifndef U_NO_THROW
#define U_NO_THROW noexcept
#endif
/*===========================================================================*/
/* UClassID-based RTTI */
/*===========================================================================*/
/**
* UClassID is used to identify classes without using the compiler's RTTI.
* This was used before C++ compilers consistently supported RTTI.
* ICU 4.6 requires compiler RTTI to be turned on.
*
* Each class hierarchy which needs
* to implement polymorphic clone() or operator==() defines two methods,
* described in detail below. UClassID values can be compared using
* operator==(). Nothing else should be done with them.
*
* \par
* In class hierarchies that implement "poor man's RTTI",
* each concrete subclass implements getDynamicClassID() in the same way:
*
* \code
* class Derived {
* public:
* virtual UClassID getDynamicClassID() const
* { return Derived::getStaticClassID(); }
* };
* \endcode
*
* Each concrete class implements getStaticClassID() as well, which allows
* clients to test for a specific type.
*
* \code
* class Derived {
* public:
* static UClassID U_EXPORT2 getStaticClassID();
* private:
* static char fgClassID;
* };
*
* // In Derived.cpp:
* UClassID Derived::getStaticClassID()
* { return (UClassID)&Derived::fgClassID; }
* char Derived::fgClassID = 0; // Value is irrelevant; only its address is used
* \endcode
* @stable ICU 2.0
*/
typedef void* UClassID;
U_NAMESPACE_BEGIN
/**
* UMemory is the common ICU base class.
* All other ICU C++ classes are derived from UMemory (starting with ICU 2.4).
*
* This is primarily to make it possible and simple to override the
* C++ memory management by adding new/delete operators to this base class.
*
* To override ALL ICU memory management, including that from plain C code,
* replace the allocation functions declared in cmemory.h
*
* UMemory does not contain any virtual functions.
* Common "boilerplate" functions are defined in UObject.
*
* @stable ICU 2.4
*/
class U_COMMON_API UMemory {
public:
/* test versions for debugging shaper heap memory problems */
#ifdef SHAPER_MEMORY_DEBUG
static void * NewArray(int size, int count);
static void * GrowArray(void * array, int newSize );
static void FreeArray(void * array );
#endif /* SHAPER_MEMORY_DEBUG */
#if U_OVERRIDE_CXX_ALLOCATION
/**
* Override for ICU4C C++ memory management.
* simple, non-class types are allocated using the macros in common/cmemory.h
* (uprv_malloc(), uprv_free(), uprv_realloc());
* they or something else could be used here to implement C++ new/delete
* for ICU4C C++ classes
*
* Declared noexcept so that the compiler checks the result for nullptr
* before calling a constructor (see the U_NO_THROW documentation in this header).
* @stable ICU 2.4
*/
static void * U_EXPORT2 operator new(size_t size) noexcept;
/**
* Override for ICU4C C++ memory management.
* See new().
* @stable ICU 2.4
*/
static void * U_EXPORT2 operator new[](size_t size) noexcept;
/**
* Override for ICU4C C++ memory management.
* simple, non-class types are allocated using the macros in common/cmemory.h
* (uprv_malloc(), uprv_free(), uprv_realloc());
* they or something else could be used here to implement C++ new/delete
* for ICU4C C++ classes
* @stable ICU 2.4
*/
static void U_EXPORT2 operator delete(void *p) noexcept;
/**
* Override for ICU4C C++ memory management.
* See delete().
* @stable ICU 2.4
*/
static void U_EXPORT2 operator delete[](void *p) noexcept;
#if U_HAVE_PLACEMENT_NEW
/**
* Override for ICU4C C++ memory management for STL.
* Placement form: returns ptr unchanged and allocates nothing.
* See new().
* @stable ICU 2.6
*/
static inline void * U_EXPORT2 operator new(size_t, void *ptr) noexcept { return ptr; }
/**
* Override for ICU4C C++ memory management for STL.
* Matching placement delete: does nothing.
* See delete().
* @stable ICU 2.6
*/
static inline void U_EXPORT2 operator delete(void *, void *) noexcept {}
#endif /* U_HAVE_PLACEMENT_NEW */
#if U_HAVE_DEBUG_LOCATION_NEW
/**
* This method overrides the MFC debug version of the operator new
*
* @param size The requested memory size
* @param file The file where the allocation was requested
* @param line The line where the allocation was requested
*/
static void * U_EXPORT2 operator new(size_t size, const char* file, int line) noexcept;
/**
* This method provides a matching delete for the MFC debug new
*
* @param p The pointer to the allocated memory
* @param file The file where the allocation was requested
* @param line The line where the allocation was requested
*/
static void U_EXPORT2 operator delete(void* p, const char* file, int line) noexcept;
#endif /* U_HAVE_DEBUG_LOCATION_NEW */
#endif /* U_OVERRIDE_CXX_ALLOCATION */
/*
* Assignment operator not declared. The compiler will provide one
* which does nothing since this class does not contain any data members.
* API/code coverage may show the assignment operator as present and
* untested - ignore.
* Subclasses need this assignment operator if they use compiler-provided
* assignment operators of their own. An alternative to not declaring one
* here would be to declare and empty-implement a protected or public one.
UMemory &UMemory::operator=(const UMemory &);
*/
};
/**
* UObject is the common ICU "boilerplate" class.
* UObject inherits UMemory (starting with ICU 2.4),
* and all other public ICU C++ classes
* are derived from UObject (starting with ICU 2.2).
*
* UObject contains common virtual functions, in particular a virtual destructor.
*
* The clone() function is not available in UObject because it is not
* implemented by all ICU classes.
* Many ICU services provide a clone() function for their class trees,
* defined on the service's C++ base class
* (which itself is a subclass of UObject).
*
* @stable ICU 2.2
*/
class U_COMMON_API UObject : public UMemory {
public:
/**
* Destructor.
*
* @stable ICU 2.2
*/
virtual ~UObject();
/**
* ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
* The base class implementation returns a dummy value.
*
* Use compiler RTTI rather than ICU's "poor man's RTTI".
* Since ICU 4.6, new ICU C++ class hierarchies do not implement "poor man's RTTI".
*
* @stable ICU 2.2
*/
virtual UClassID getDynamicClassID() const;
protected:
// the following functions are protected to prevent instantiation and
// direct use of UObject itself
// NOTE(review): both constructors below are commented out, so nothing is
// actually declared protected here; the compiler-provided constructors apply.
// default constructor
// inline UObject() {}
// copy constructor
// inline UObject(const UObject &other) {}
#if 0
// Everything up to the matching #endif is compiled out; it is kept only as a
// record of "boilerplate" API that may be added later.
// TODO Sometime in the future. Implement operator==().
// (This comment inserted in 2.2)
// some or all of the following "boilerplate" functions may be made public
// in a future ICU4C release when all subclasses implement them
// assignment operator
// (not virtual, see "Taligent's Guide to Designing Programs" pp.73..74)
// commented out because the implementation is the same as a compiler's default
// UObject &operator=(const UObject &other) { return *this; }
// comparison operators
virtual inline bool operator==(const UObject &other) const { return this==&other; }
inline bool operator!=(const UObject &other) const { return !operator==(other); }
// clone() commented out from the base class:
// some compilers do not support co-variant return types
// (i.e., subclasses would have to return UObject * as well, instead of SubClass *)
// see also UObject class documentation.
// virtual UObject *clone() const;
#endif /* 0 */
/*
* Assignment operator not declared. The compiler will provide one
* which does nothing since this class does not contain any data members.
* API/code coverage may show the assignment operator as present and
* untested - ignore.
* Subclasses need this assignment operator if they use compiler-provided
* assignment operators of their own. An alternative to not declaring one
* here would be to declare and empty-implement a protected or public one.
UObject &UObject::operator=(const UObject &);
*/
};
#ifndef U_HIDE_INTERNAL_API
/**
* This is a simple macro to add ICU RTTI to an ICU object implementation.
* This does not go into the header. This should only be used in *.cpp files.
*
* Expands to the out-of-line definitions of myClass::getStaticClassID() and
* myClass::getDynamicClassID(). The class ID is the address of a
* function-local static char; the char's value is irrelevant.
* The dynamic ID simply forwards to the static one.
*
* @param myClass The name of the class that needs RTTI defined.
* @internal
*/
#define UOBJECT_DEFINE_RTTI_IMPLEMENTATION(myClass) \
    UClassID U_EXPORT2 myClass::getStaticClassID() { \
        static char classID = 0; \
        return static_cast<UClassID>(&classID); \
    } \
    UClassID myClass::getDynamicClassID() const \
    { return myClass::getStaticClassID(); }
/**
* This macro adds ICU RTTI to an ICU abstract class implementation.
* This macro should be invoked in *.cpp files. The corresponding
* header should declare getStaticClassID.
*
* Expands to the out-of-line definition of myClass::getStaticClassID() only;
* unlike UOBJECT_DEFINE_RTTI_IMPLEMENTATION it does not define
* getDynamicClassID(). The class ID is the address of a function-local
* static char; the char's value is irrelevant.
*
* @param myClass The name of the class that needs RTTI defined.
* @internal
*/
#define UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(myClass) \
    UClassID U_EXPORT2 myClass::getStaticClassID() { \
        static char classID = 0; \
        return static_cast<UClassID>(&classID); \
    }
#endif /* U_HIDE_INTERNAL_API */
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

2043
thirdparty/icu4c/common/unicode/urename.h vendored Normal file

File diff suppressed because it is too large Load Diff

157
thirdparty/icu4c/common/unicode/urep.h vendored Normal file
View File

@@ -0,0 +1,157 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
* Copyright (C) 1997-2010, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* Date Name Description
* 06/23/00 aliu Creation.
******************************************************************************
*/
#ifndef __UREP_H
#define __UREP_H
#include "unicode/utypes.h"
U_CDECL_BEGIN
/********************************************************************
* General Notes
********************************************************************
* TODO
* Add usage scenario
* Add test code
* Talk about pinning
* Talk about "can truncate result if out of memory"
*/
/********************************************************************
* Data Structures
********************************************************************/
/**
* \file
* \brief C API: Callbacks for UReplaceable
*/
/**
* An opaque replaceable text object. This will be manipulated only
* through the caller-supplied UReplaceableCallbacks struct. Related
* to the C++ class Replaceable.
* This is currently only used in the Transliterator C API, see utrans.h .
* @stable ICU 2.0
*/
typedef void* UReplaceable;
/**
* A set of function pointers that transliterators use to manipulate a
* UReplaceable. The caller should supply the required functions to
* manipulate their text appropriately. Related to the C++ class
* Replaceable.
* @stable ICU 2.0
*/
typedef struct UReplaceableCallbacks {
/**
* Function pointer that returns the number of UChar code units in
* this text.
*
* @param rep A pointer to "this" UReplaceable object.
* @return The length of the text.
* @stable ICU 2.0
*/
int32_t (*length)(const UReplaceable* rep);
/**
* Function pointer that returns the UChar code unit at the given
* offset into this text; 0 <= offset < n, where n is the value
* returned by (*length)(rep). See unistr.h for a description of
* charAt() vs. char32At().
*
* @param rep A pointer to "this" UReplaceable object.
* @param offset The index at which to fetch the UChar (code unit).
* @return The UChar (code unit) at offset, or U+FFFF if the offset is out of bounds.
* @stable ICU 2.0
*/
UChar (*charAt)(const UReplaceable* rep,
int32_t offset);
/**
* Function pointer that returns a UChar32 code point at the given
* offset into this text. See unistr.h for a description of
* charAt() vs. char32At().
*
* @param rep A pointer to "this" UReplaceable object.
* @param offset The index at which to fetch the UChar32 (code point).
* @return The UChar32 (code point) at offset, or U+FFFF if the offset is out of bounds.
* @stable ICU 2.0
*/
UChar32 (*char32At)(const UReplaceable* rep,
int32_t offset);
/**
* Function pointer that replaces text between start and limit in
* this text with the given text. Attributes (out of band info)
* should be retained.
*
* @param rep A pointer to "this" UReplaceable object.
* @param start the starting index of the text to be replaced,
* inclusive.
* @param limit the ending index of the text to be replaced,
* exclusive.
* @param text the new text to replace the UChars from
* start..limit-1.
* @param textLength the number of UChars at text, or -1 if text
* is null-terminated.
* @stable ICU 2.0
*/
void (*replace)(UReplaceable* rep,
int32_t start,
int32_t limit,
const UChar* text,
int32_t textLength);
/**
* Function pointer that copies the characters in the range
* [<tt>start</tt>, <tt>limit</tt>) into the array <tt>dst</tt>.
*
* @param rep A pointer to "this" UReplaceable object.
* @param start offset of first character which will be copied
* into the array
* @param limit offset immediately following the last character to
* be copied
* @param dst array in which to copy characters. The length of
* <tt>dst</tt> must be at least <tt>(limit - start)</tt>.
* @stable ICU 2.1
*/
void (*extract)(UReplaceable* rep,
int32_t start,
int32_t limit,
UChar* dst);
/**
* Function pointer that copies text between start and limit in
* this text to another index in the text. Attributes (out of
* band info) should be retained. After this call, there will be
* (at least) two copies of the characters originally located at
* start..limit-1.
*
* @param rep A pointer to "this" UReplaceable object.
* @param start the starting index of the text to be copied,
* inclusive.
* @param limit the ending index of the text to be copied,
* exclusive.
* @param dest the index at which the copy of the UChars should be
* inserted.
* @stable ICU 2.0
*/
void (*copy)(UReplaceable* rep,
int32_t start,
int32_t limit,
int32_t dest);
} UReplaceableCallbacks;
U_CDECL_END
#endif

912
thirdparty/icu4c/common/unicode/ures.h vendored Normal file
View File

@@ -0,0 +1,912 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* File URES.H (formerly CRESBUND.H)
*
* Modification History:
*
* Date Name Description
* 04/01/97 aliu Creation.
* 02/22/99 damiba overhaul.
* 04/04/99 helena Fixed internal header inclusion.
* 04/15/99 Madhu Updated Javadoc
* 06/14/99 stephen Removed functions taking a filename suffix.
* 07/20/99 stephen Language-independent typedef to void*
* 11/09/99 weiv Added ures_getLocale()
* 06/24/02 weiv Added support for resource sharing
******************************************************************************
*/
#ifndef URES_H
#define URES_H
#include "unicode/char16ptr.h"
#include "unicode/utypes.h"
#include "unicode/uloc.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
* \file
* \brief C API: Resource Bundle
*
* <h2>C API: Resource Bundle</h2>
*
* C API representing a collection of resource information pertaining to a given
* locale. A resource bundle provides a way of accessing locale-specific information in
* a data file. You create a resource bundle that manages the resources for a given
* locale and then ask it for individual resources.
* <P>
* Resource bundles in ICU4C are currently defined using text files which conform to the following
* <a href="https://github.com/unicode-org/icu-docs/blob/main/design/bnf_rb.txt">BNF definition</a>.
* More on resource bundle concepts and syntax can be found in the
* <a href="https://unicode-org.github.io/icu/userguide/locale/resources">Users Guide</a>.
* <P>
*/
/**
* UResourceBundle is an opaque type for handles for resource bundles in C APIs.
* Only this forward declaration is visible here; the struct is used through pointers.
* @stable ICU 2.0
*/
struct UResourceBundle;
/**
* Typedef so that the opaque struct can be named as plain UResourceBundle,
* without the struct keyword.
* @stable ICU 2.0
*/
typedef struct UResourceBundle UResourceBundle;
/**
 * Numeric constants identifying the kind of item held by a resource.
 * @see ures_getType
 * @stable ICU 2.0
 */
typedef enum {
    /** Resource type constant for "no resource". @stable ICU 2.6 */
    URES_NONE = -1,

    /** Resource type constant for 16-bit Unicode strings. @stable ICU 2.6 */
    URES_STRING = 0,

    /** Resource type constant for binary data. @stable ICU 2.6 */
    URES_BINARY = 1,

    /** Resource type constant for tables of key-value pairs. @stable ICU 2.6 */
    URES_TABLE = 2,

    /**
     * Resource type constant for aliases;
     * internally stores a string which identifies the actual resource
     * storing the data (can be in a different resource bundle).
     * Resolved internally before delivering the actual resource through the API.
     * @stable ICU 2.6
     */
    URES_ALIAS = 3,

    /**
     * Resource type constant for a single 28-bit integer, interpreted as
     * signed or unsigned by the ures_getInt() or ures_getUInt() function.
     * @see ures_getInt
     * @see ures_getUInt
     * @stable ICU 2.6
     */
    URES_INT = 7,

    /** Resource type constant for arrays of resources. @stable ICU 2.6 */
    URES_ARRAY = 8,

    /**
     * Resource type constant for vectors of 32-bit integers.
     * @see ures_getIntVector
     * @stable ICU 2.6
     */
    URES_INT_VECTOR = 14,
#ifndef U_HIDE_DEPRECATED_API
    /* Legacy RES_ spellings: each one aliases the URES_ constant of the same name. */

    /** @deprecated ICU 2.6 Use the URES_ constant instead. */
    RES_NONE = URES_NONE,

    /** @deprecated ICU 2.6 Use the URES_ constant instead. */
    RES_STRING = URES_STRING,

    /** @deprecated ICU 2.6 Use the URES_ constant instead. */
    RES_BINARY = URES_BINARY,

    /** @deprecated ICU 2.6 Use the URES_ constant instead. */
    RES_TABLE = URES_TABLE,

    /** @deprecated ICU 2.6 Use the URES_ constant instead. */
    RES_ALIAS = URES_ALIAS,

    /** @deprecated ICU 2.6 Use the URES_ constant instead. */
    RES_INT = URES_INT,

    /** @deprecated ICU 2.6 Use the URES_ constant instead. */
    RES_ARRAY = URES_ARRAY,

    /** @deprecated ICU 2.6 Use the URES_ constant instead. */
    RES_INT_VECTOR = URES_INT_VECTOR,

    /** @deprecated ICU 2.6 Not used. */
    RES_RESERVED = 15,

    /**
     * One more than the highest normal UResType value.
     * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
     */
    URES_LIMIT = 16
#endif // U_HIDE_DEPRECATED_API
} UResType;
/*
* Functions to create and destroy resource bundles.
*/
/**
* Opens a UResourceBundle, from which users can extract strings by using
* their corresponding keys.
* Note that the caller is responsible for calling <TT>ures_close</TT> on each successfully
* opened resource bundle.
* @param packageName The packageName and locale together point to an ICU udata object,
* as defined by <code> udata_open( packageName, "res", locale, err) </code>
* or equivalent. Typically, packageName will refer to a (.dat) file, or to
* a package registered with udata_setAppData(). Using a full file or directory
* pathname for packageName is deprecated. If NULL, ICU data will be used.
* @param locale specifies the locale for which we want to open the resource
* if NULL, the default locale will be used. If strlen(locale) == 0
* root locale will be used.
*
* @param status fills in the outgoing error code.
* The UErrorCode err parameter is used to return status information to the user. To
* check whether the construction succeeded or not, you should check the value of
* U_SUCCESS(err). If you wish more detailed information, you can check for
* informational status results which still indicate success. U_USING_FALLBACK_WARNING
* indicates that a fall back locale was used. For example, 'de_CH' was requested,
* but nothing was found there, so 'de' was used. U_USING_DEFAULT_WARNING indicates that
* the default locale data or root locale data was used; neither the requested locale
* nor any of its fall back locales could be found. Please see the users guide for more
* information on this topic.
* @return a newly allocated resource bundle.
* @see ures_close
* @stable ICU 2.0
*/
U_CAPI UResourceBundle* U_EXPORT2
ures_open(const char* packageName,
const char* locale,
UErrorCode* status);
/** This function does not care what kind of localeID is passed in. It simply opens a bundle with
* that name. Fallback mechanism is disabled for the new bundle. If the requested bundle contains
* an %%ALIAS directive, the results are undefined.
* @param packageName The packageName and locale together point to an ICU udata object,
* as defined by <code> udata_open( packageName, "res", locale, err) </code>
* or equivalent. Typically, packageName will refer to a (.dat) file, or to
* a package registered with udata_setAppData(). Using a full file or directory
* pathname for packageName is deprecated. If NULL, ICU data will be used.
* @param locale specifies the locale for which we want to open the resource
* if NULL, the default locale will be used. If strlen(locale) == 0
* root locale will be used.
*
* @param status fills in the outgoing error code. Either U_ZERO_ERROR or U_MISSING_RESOURCE_ERROR
* @return a newly allocated resource bundle or NULL if it doesn't exist.
* @see ures_close
* @stable ICU 2.0
*/
U_CAPI UResourceBundle* U_EXPORT2
ures_openDirect(const char* packageName,
const char* locale,
UErrorCode* status);
/**
* Same as ures_open() but takes a const UChar *path.
* This path will be converted to char * using the default converter,
* then ures_open() is called.
*
* @param packageName The packageName and locale together point to an ICU udata object,
* as defined by <code> udata_open( packageName, "res", locale, err) </code>
* or equivalent. Typically, packageName will refer to a (.dat) file, or to
* a package registered with udata_setAppData(). Using a full file or directory
* pathname for packageName is deprecated. If NULL, ICU data will be used.
* @param locale specifies the locale for which we want to open the resource
* if NULL, the default locale will be used. If strlen(locale) == 0
* root locale will be used.
* @param status fills in the outgoing error code.
* @return a newly allocated resource bundle.
* @see ures_open
* @stable ICU 2.0
*/
U_CAPI UResourceBundle* U_EXPORT2
ures_openU(const UChar* packageName,
const char* locale,
UErrorCode* status);
#ifndef U_HIDE_DEPRECATED_API
/**
* Returns the number of strings/arrays in resource bundles.
* Better to use ures_getSize, as this function will be deprecated.
*
*@param resourceBundle resource bundle containing the desired strings
*@param resourceKey key tagging the resource
*@param err fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a non-failing error
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
*@return: for <STRONG>Arrays</STRONG>: returns the number of resources in the array
* <STRONG>Tables</STRONG>: returns the number of resources in the table
* <STRONG>single string</STRONG>: returns 1
*@see ures_getSize
* @deprecated ICU 2.8 Use ures_getSize instead.
*/
U_DEPRECATED int32_t U_EXPORT2
ures_countArrayItems(const UResourceBundle* resourceBundle,
const char* resourceKey,
UErrorCode* err);
#endif /* U_HIDE_DEPRECATED_API */
/**
* Close a resource bundle, all pointers returned from the various ures_getXXX calls
* on this particular bundle should be considered invalid henceforth.
*
* @param resourceBundle a pointer to a resourceBundle struct. Can be NULL.
* @see ures_open
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ures_close(UResourceBundle* resourceBundle);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUResourceBundlePointer
* "Smart pointer" class, closes a UResourceBundle via ures_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUResourceBundlePointer, UResourceBundle, ures_close);
U_NAMESPACE_END
#endif
#ifndef U_HIDE_DEPRECATED_API
/**
* Return the version number associated with this ResourceBundle as a string. Please
* use ures_getVersion as this function is going to be deprecated.
*
* @param resourceBundle The resource bundle for which the version is checked.
* @return A version number string as specified in the resource bundle or its parent.
* The caller does not own this string.
* @see ures_getVersion
* @deprecated ICU 2.8 Use ures_getVersion instead.
*/
U_DEPRECATED const char* U_EXPORT2
ures_getVersionNumber(const UResourceBundle* resourceBundle);
#endif /* U_HIDE_DEPRECATED_API */
/**
* Return the version number associated with this ResourceBundle as a
* UVersionInfo array.
*
* @param resB The resource bundle for which the version is checked.
* @param versionInfo A UVersionInfo array that is filled with the version number
* as specified in the resource bundle or its parent.
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ures_getVersion(const UResourceBundle* resB,
UVersionInfo versionInfo);
#ifndef U_HIDE_DEPRECATED_API
/**
* Return the name of the Locale associated with this ResourceBundle. This API allows
* you to query for the real locale of the resource. For example, if you requested
* "en_US_CALIFORNIA" and only "en_US" bundle exists, "en_US" will be returned.
* For subresources, the locale where this resource comes from will be returned.
* If fallback has occurred, getLocale will reflect this.
*
* @param resourceBundle resource bundle in question
* @param status just for catching illegal arguments
* @return A Locale name
* @deprecated ICU 2.8 Use ures_getLocaleByType instead.
*/
U_DEPRECATED const char* U_EXPORT2
ures_getLocale(const UResourceBundle* resourceBundle,
UErrorCode* status);
#endif /* U_HIDE_DEPRECATED_API */
/**
* Return the name of the Locale associated with this ResourceBundle.
* You can choose between requested, valid and real locale.
*
* @param resourceBundle resource bundle in question
* @param type You can choose between requested, valid and actual
* locale. For description see the definition of
* ULocDataLocaleType in uloc.h
* @param status just for catching illegal arguments
* @return A Locale name
* @stable ICU 2.8
*/
U_CAPI const char* U_EXPORT2
ures_getLocaleByType(const UResourceBundle* resourceBundle,
ULocDataLocaleType type,
UErrorCode* status);
#ifndef U_HIDE_INTERNAL_API
/**
* Same as ures_open() but uses the fill-in parameter instead of allocating a new bundle.
*
* TODO need to revisit usefulness of this function
* and usage model for fillIn parameters without knowing sizeof(UResourceBundle)
* @param r The existing UResourceBundle to fill in. If NULL then status will be
* set to U_ILLEGAL_ARGUMENT_ERROR.
* @param packageName The packageName and locale together point to an ICU udata object,
* as defined by <code> udata_open( packageName, "res", locale, err) </code>
* or equivalent. Typically, packageName will refer to a (.dat) file, or to
* a package registered with udata_setAppData(). Using a full file or directory
* pathname for packageName is deprecated. If NULL, ICU data will be used.
* @param localeID specifies the locale for which we want to open the resource
* @param status The error code.
* @internal
*/
U_CAPI void U_EXPORT2
ures_openFillIn(UResourceBundle *r,
const char* packageName,
const char* localeID,
UErrorCode* status);
#endif /* U_HIDE_INTERNAL_API */
/**
* Returns a string from a string resource type
*
* @param resourceBundle a string resource
* @param len fills in the length of resulting string
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* Always check the value of status. Don't count on returning NULL.
* could be a non-failing error
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file.
* @see ures_getBinary
* @see ures_getIntVector
* @see ures_getInt
* @see ures_getUInt
* @stable ICU 2.0
*/
U_CAPI const UChar* U_EXPORT2
ures_getString(const UResourceBundle* resourceBundle,
int32_t* len,
UErrorCode* status);
/**
* Returns a UTF-8 string from a string resource.
* The UTF-8 string may be returnable directly as a pointer, or
* it may need to be copied, or transformed from UTF-16 using u_strToUTF8()
* or equivalent.
*
* If forceCopy==true, then the string is always written to the dest buffer
* and dest is returned.
*
* If forceCopy==false, then the string is returned as a pointer if possible,
* without needing a dest buffer (it can be NULL). If the string needs to be
* copied or transformed, then it may be placed into dest at an arbitrary offset.
*
* If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and
* U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual.
*
* If the string is transformed from UTF-16, then a conversion error may occur
* if an unpaired surrogate is encountered. If the function is successful, then
* the output UTF-8 string is always well-formed.
*
* @param resB Resource bundle.
* @param dest Destination buffer. Can be NULL only if capacity=*length==0.
* @param length Input: Capacity of destination buffer.
* Output: Actual length of the UTF-8 string, not counting the
* terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR.
* Can be NULL, meaning capacity=0 and the string length is not
* returned to the caller.
* @param forceCopy If true, then the output string will always be written to
* dest, with U_BUFFER_OVERFLOW_ERROR and
* U_STRING_NOT_TERMINATED_WARNING set if appropriate.
* If false, then the dest buffer may or may not contain a
* copy of the string. dest may or may not be modified.
* If a copy needs to be written, then the UErrorCode parameter
* indicates overflow etc. as usual.
* @param status Pointer to a standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return The pointer to the UTF-8 string. It may be dest, or at some offset
* from dest (only if !forceCopy), or in unrelated memory.
* Always NUL-terminated unless the string was written to dest and
* length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set).
*
* @see ures_getString
* @see u_strToUTF8
* @stable ICU 3.6
*/
U_CAPI const char * U_EXPORT2
ures_getUTF8String(const UResourceBundle *resB,
char *dest, int32_t *length,
UBool forceCopy,
UErrorCode *status);
/**
* Returns binary data from a binary resource.
*
* @param resourceBundle a binary resource
* @param len fills in the length of resulting byte chunk
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* Always check the value of status. Don't count on returning NULL.
* could be a non-failing error
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return a pointer to a chunk of unsigned bytes which live in a memory mapped/DLL file.
* @see ures_getString
* @see ures_getIntVector
* @see ures_getInt
* @see ures_getUInt
* @stable ICU 2.0
*/
U_CAPI const uint8_t* U_EXPORT2
ures_getBinary(const UResourceBundle* resourceBundle,
int32_t* len,
UErrorCode* status);
/**
* Returns a 32 bit integer array from a resource.
*
* @param resourceBundle an int vector resource
* @param len fills in the length of the resulting integer vector
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* Always check the value of status. Don't count on returning NULL.
* could be a non-failing error
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return a pointer to a chunk of integers which live in a memory mapped/DLL file.
* @see ures_getBinary
* @see ures_getString
* @see ures_getInt
* @see ures_getUInt
* @stable ICU 2.0
*/
U_CAPI const int32_t* U_EXPORT2
ures_getIntVector(const UResourceBundle* resourceBundle,
int32_t* len,
UErrorCode* status);
/**
* Returns an unsigned integer from a resource.
* This integer is originally 28 bits.
*
* @param resourceBundle an integer resource
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a non-failing error
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return an integer value
* @see ures_getInt
* @see ures_getIntVector
* @see ures_getBinary
* @see ures_getString
* @stable ICU 2.0
*/
U_CAPI uint32_t U_EXPORT2
ures_getUInt(const UResourceBundle* resourceBundle,
UErrorCode *status);
/**
* Returns a signed integer from a resource.
* This integer is originally 28 bit and the sign gets propagated.
*
* @param resourceBundle an integer resource
* @param status fills in the outgoing error code
* could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
* could be a non-failing error
* e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
* @return an integer value
* @see ures_getUInt
* @see ures_getIntVector
* @see ures_getBinary
* @see ures_getString
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ures_getInt(const UResourceBundle* resourceBundle,
UErrorCode *status);
/**
* Returns the size of a resource. Size for scalar types is always 1,
* and for vector/table types is the number of child resources.
* @warning Integer array is treated as a scalar type. There are no
* APIs to access individual members of an integer array. It
* is always returned as a whole.
* @param resourceBundle a resource
* @return number of resources in a given resource.
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
ures_getSize(const UResourceBundle *resourceBundle);
/**
* Returns the type of a resource. Available types are defined in enum UResType
*
* @param resourceBundle a resource
* @return type of the given resource.
* @see UResType
* @stable ICU 2.0
*/
U_CAPI UResType U_EXPORT2
ures_getType(const UResourceBundle *resourceBundle);
/**
* Returns the key associated with a given resource. Not all the resources have a key - only
* those that are members of a table.
*
* @param resourceBundle a resource
* @return a key associated to this resource, or NULL if it doesn't have a key
* @stable ICU 2.0
*/
U_CAPI const char * U_EXPORT2
ures_getKey(const UResourceBundle *resourceBundle);
/* ITERATION API
This API provides means for iterating through a resource
*/
/**
* Resets the internal context of a resource so that iteration starts from the first element.
*
* @param resourceBundle a resource
* @stable ICU 2.0
*/
U_CAPI void U_EXPORT2
ures_resetIterator(UResourceBundle *resourceBundle);
/**
* Checks whether the given resource has another element to iterate over.
*
* @param resourceBundle a resource
* @return true if there are more elements, false if there are no more elements
* @stable ICU 2.0
*/
U_CAPI UBool U_EXPORT2
ures_hasNext(const UResourceBundle *resourceBundle);
/**
* Returns the next resource in a given resource or NULL if there are no more resources
* to iterate over. Features a fill-in parameter.
*
* @param resourceBundle a resource
* @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller.
* Alternatively, you can supply a struct to be filled by this function.
* @param status fills in the outgoing error code. You may still get a non NULL result even if an
* error occurred. Check status instead.
* @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it
* @stable ICU 2.0
*/
U_CAPI UResourceBundle* U_EXPORT2
ures_getNextResource(UResourceBundle *resourceBundle,
UResourceBundle *fillIn,
UErrorCode *status);
/**
* Returns the next string in a given resource or NULL if there are no more resources
* to iterate over.
*
* @param resourceBundle a resource
* @param len fill in length of the string
* @param key fill in for key associated with this string. NULL if no key
* @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't
* count on it. Check status instead!
* @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file.
* @stable ICU 2.0
*/
U_CAPI const UChar* U_EXPORT2
ures_getNextString(UResourceBundle *resourceBundle,
int32_t* len,
const char ** key,
UErrorCode *status);
/**
* Returns the resource in a given resource at the specified index. Features a fill-in parameter.
*
* @param resourceBundle the resource bundle from which to get a sub-resource
* @param indexR an index to the wanted resource.
* @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller.
* Alternatively, you can supply a struct to be filled by this function.
* @param status fills in the outgoing error code. Don't count on NULL being returned if an error has
* occurred. Check status instead.
* @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it
* @stable ICU 2.0
*/
U_CAPI UResourceBundle* U_EXPORT2
ures_getByIndex(const UResourceBundle *resourceBundle,
int32_t indexR,
UResourceBundle *fillIn,
UErrorCode *status);
/**
* Returns the string in a given resource at the specified index.
*
* @param resourceBundle a resource
* @param indexS an index to the wanted string.
* @param len fill in length of the string
* @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't
* count on it. Check status instead!
* @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file.
* @stable ICU 2.0
*/
U_CAPI const UChar* U_EXPORT2
ures_getStringByIndex(const UResourceBundle *resourceBundle,
int32_t indexS,
int32_t* len,
UErrorCode *status);
/**
* Returns a UTF-8 string from a resource at the specified index.
* The UTF-8 string may be returnable directly as a pointer, or
* it may need to be copied, or transformed from UTF-16 using u_strToUTF8()
* or equivalent.
*
* If forceCopy==true, then the string is always written to the dest buffer
* and dest is returned.
*
* If forceCopy==false, then the string is returned as a pointer if possible,
* without needing a dest buffer (it can be NULL). If the string needs to be
* copied or transformed, then it may be placed into dest at an arbitrary offset.
*
* If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and
* U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual.
*
* If the string is transformed from UTF-16, then a conversion error may occur
* if an unpaired surrogate is encountered. If the function is successful, then
* the output UTF-8 string is always well-formed.
*
* @param resB Resource bundle.
* @param stringIndex An index to the wanted string.
* @param dest Destination buffer. Can be NULL only if capacity=*length==0.
* @param pLength Input: Capacity of destination buffer.
* Output: Actual length of the UTF-8 string, not counting the
* terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR.
* Can be NULL, meaning capacity=0 and the string length is not
* returned to the caller.
* @param forceCopy If true, then the output string will always be written to
* dest, with U_BUFFER_OVERFLOW_ERROR and
* U_STRING_NOT_TERMINATED_WARNING set if appropriate.
* If false, then the dest buffer may or may not contain a
* copy of the string. dest may or may not be modified.
* If a copy needs to be written, then the UErrorCode parameter
* indicates overflow etc. as usual.
* @param status Pointer to a standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return The pointer to the UTF-8 string. It may be dest, or at some offset
* from dest (only if !forceCopy), or in unrelated memory.
* Always NUL-terminated unless the string was written to dest and
* length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set).
*
* @see ures_getStringByIndex
* @see u_strToUTF8
* @stable ICU 3.6
*/
U_CAPI const char * U_EXPORT2
ures_getUTF8StringByIndex(const UResourceBundle *resB,
int32_t stringIndex,
char *dest, int32_t *pLength,
UBool forceCopy,
UErrorCode *status);
/**
* Returns a resource in a given resource that has a given key. This procedure works only with table
* resources. Features a fill-in parameter.
*
* @param resourceBundle a resource
* @param key a key associated with the wanted resource
* @param fillIn if NULL a new UResourceBundle struct is allocated and must be closed by the caller.
* Alternatively, you can supply a struct to be filled by this function.
* @param status fills in the outgoing error code.
* @return a pointer to a UResourceBundle struct. If fill in param was NULL, caller must close it
* @stable ICU 2.0
*/
U_CAPI UResourceBundle* U_EXPORT2
ures_getByKey(const UResourceBundle *resourceBundle,
const char* key,
UResourceBundle *fillIn,
UErrorCode *status);
/**
* Returns a string in a given resource that has a given key. This procedure works only with table
* resources.
*
* @param resB a resource
* @param key a key associated with the wanted string
* @param len fill in length of the string
* @param status fills in the outgoing error code. If an error occurred, we may return NULL, but don't
* count on it. Check status instead!
* @return a pointer to a zero-terminated UChar array which lives in a memory mapped/DLL file.
* @stable ICU 2.0
*/
U_CAPI const UChar* U_EXPORT2
ures_getStringByKey(const UResourceBundle *resB,
const char* key,
int32_t* len,
UErrorCode *status);
/**
* Returns a UTF-8 string from a resource and a key.
* This function works only with table resources.
*
* The UTF-8 string may be returnable directly as a pointer, or
* it may need to be copied, or transformed from UTF-16 using u_strToUTF8()
* or equivalent.
*
* If forceCopy==true, then the string is always written to the dest buffer
* and dest is returned.
*
* If forceCopy==false, then the string is returned as a pointer if possible,
* without needing a dest buffer (it can be NULL). If the string needs to be
* copied or transformed, then it may be placed into dest at an arbitrary offset.
*
* If the string is to be written to dest, then U_BUFFER_OVERFLOW_ERROR and
* U_STRING_NOT_TERMINATED_WARNING are set if appropriate, as usual.
*
* If the string is transformed from UTF-16, then a conversion error may occur
* if an unpaired surrogate is encountered. If the function is successful, then
* the output UTF-8 string is always well-formed.
*
* @param resB Resource bundle.
* @param key A key associated with the wanted resource
* @param dest Destination buffer. Can be NULL only if capacity=*length==0.
* @param pLength Input: Capacity of destination buffer.
* Output: Actual length of the UTF-8 string, not counting the
* terminating NUL, even in case of U_BUFFER_OVERFLOW_ERROR.
* Can be NULL, meaning capacity=0 and the string length is not
* returned to the caller.
* @param forceCopy If true, then the output string will always be written to
* dest, with U_BUFFER_OVERFLOW_ERROR and
* U_STRING_NOT_TERMINATED_WARNING set if appropriate.
* If false, then the dest buffer may or may not contain a
* copy of the string. dest may or may not be modified.
* If a copy needs to be written, then the UErrorCode parameter
* indicates overflow etc. as usual.
* @param status Pointer to a standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return The pointer to the UTF-8 string. It may be dest, or at some offset
* from dest (only if !forceCopy), or in unrelated memory.
* Always NUL-terminated unless the string was written to dest and
* length==capacity (in which case U_STRING_NOT_TERMINATED_WARNING is set).
*
* @see ures_getStringByKey
* @see u_strToUTF8
* @stable ICU 3.6
*/
U_CAPI const char * U_EXPORT2
ures_getUTF8StringByKey(const UResourceBundle *resB,
const char *key,
char *dest, int32_t *pLength,
UBool forceCopy,
UErrorCode *status);
#if U_SHOW_CPLUSPLUS_API
#include "unicode/unistr.h"
U_NAMESPACE_BEGIN
/**
 * Fetches the value of a string resource as a UnicodeString.
 *
 * @param resB a resource, should have type URES_STRING
 * @param status fills in the outgoing error code
 *                could be <TT>U_MISSING_RESOURCE_ERROR</TT> if the key is not found
 *                could be a non-failing error
 *                e.g.: <TT>U_USING_FALLBACK_WARNING</TT>,<TT>U_USING_DEFAULT_WARNING </TT>
 * @return The string value, or a bogus string if there is a failure UErrorCode.
 * @stable ICU 2.0
 */
inline UnicodeString
ures_getUnicodeString(const UResourceBundle *resB, UErrorCode* status) {
    int32_t length = 0;
    const char16_t *chars = ConstChar16Ptr(ures_getString(resB, &length, status));
    UnicodeString str;
    if(U_FAILURE(*status)) {
        // The lookup failed: signal it to the caller with a bogus string.
        str.setToBogus();
    } else {
        // ures_getString() hands back a zero-terminated array, hence `true`.
        str.setTo(true, chars, length);
    }
    return str;
}
/**
 * Advances the iteration over a resource and returns the next string,
 * or an empty string if there are no more resources to iterate over.
 * Use ures_getNextString() instead to distinguish between
 * the end of the iteration and a real empty string value.
 *
 * @param resB a resource
 * @param key fill in for key associated with this string
 * @param status fills in the outgoing error code
 * @return The string value, or a bogus string if there is a failure UErrorCode.
 * @stable ICU 2.0
 */
inline UnicodeString
ures_getNextUnicodeString(UResourceBundle *resB, const char ** key, UErrorCode* status) {
    int32_t length = 0;
    const char16_t *chars = ConstChar16Ptr(ures_getNextString(resB, &length, key, status));
    UnicodeString str;
    if(U_FAILURE(*status)) {
        // The lookup failed: signal it to the caller with a bogus string.
        str.setToBogus();
    } else {
        // ures_getNextString() hands back a zero-terminated array, hence `true`.
        str.setTo(true, chars, length);
    }
    return str;
}
/**
 * Looks up the string stored at the given index of a resource array or table.
 *
 * @param resB a resource
 * @param indexS an index to the wanted string.
 * @param status fills in the outgoing error code
 * @return The string value, or a bogus string if there is a failure UErrorCode.
 * @stable ICU 2.0
 */
inline UnicodeString
ures_getUnicodeStringByIndex(const UResourceBundle *resB, int32_t indexS, UErrorCode* status) {
    int32_t length = 0;
    const char16_t *chars = ConstChar16Ptr(ures_getStringByIndex(resB, indexS, &length, status));
    UnicodeString str;
    if(U_FAILURE(*status)) {
        // The lookup failed: signal it to the caller with a bogus string.
        str.setToBogus();
    } else {
        // ures_getStringByIndex() hands back a zero-terminated array, hence `true`.
        str.setTo(true, chars, length);
    }
    return str;
}
/**
 * Looks up the string stored under the given key of a resource.
 * This procedure works only with table resources.
 *
 * @param resB a resource
 * @param key a key associated with the wanted string
 * @param status fills in the outgoing error code
 * @return The string value, or a bogus string if there is a failure UErrorCode.
 * @stable ICU 2.0
 */
inline UnicodeString
ures_getUnicodeStringByKey(const UResourceBundle *resB, const char* key, UErrorCode* status) {
    int32_t length = 0;
    const char16_t *chars = ConstChar16Ptr(ures_getStringByKey(resB, key, &length, status));
    UnicodeString str;
    if(U_FAILURE(*status)) {
        // The lookup failed: signal it to the caller with a bogus string.
        str.setToBogus();
    } else {
        // ures_getStringByKey() hands back a zero-terminated array, hence `true`.
        str.setTo(true, chars, length);
    }
    return str;
}
U_NAMESPACE_END
#endif
/**
* Create a string enumerator, owned by the caller, of all locales located within
* the specified resource tree.
* @param packageName name of the tree, such as (NULL) or U_ICUDATA_ALIAS or "ICUDATA-coll"
* This call is similar to uloc_getAvailable().
* @param status error code
* @stable ICU 3.2
*/
U_CAPI UEnumeration* U_EXPORT2
ures_openAvailableLocales(const char *packageName, UErrorCode *status);
#endif /*_URES*/
/*eof*/

View File

@@ -0,0 +1,742 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1997-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* File USCRIPT.H
*
* Modification History:
*
* Date Name Description
* 07/06/2001 Ram Creation.
******************************************************************************
*/
#ifndef USCRIPT_H
#define USCRIPT_H
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Unicode Script Information
*/
/**
* Constants for ISO 15924 script codes.
*
* The current set of script code constants supports at least all scripts
* that are encoded in the version of Unicode which ICU currently supports.
* The names of the constants are usually derived from the
* Unicode script property value aliases.
* See UAX #24 Unicode Script Property (http://www.unicode.org/reports/tr24/)
* and http://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt .
*
* In addition, constants for many ISO 15924 script codes
* are included, for use with language tags, CLDR data, and similar.
* Some of those codes are not used in the Unicode Character Database (UCD).
* For example, there are no characters that have a UCD script property value of
* Hans or Hant. All Han ideographs have the Hani script property value in Unicode.
*
* Private-use codes Qaaa..Qabx are not included, except as used in the UCD or in CLDR.
*
* Starting with ICU 55, script codes are only added when their scripts
* have been or will certainly be encoded in Unicode,
* and have been assigned Unicode script property value aliases,
* to ensure that their script names are stable and match the names of the constants.
* Script codes like Latf and Aran that are not subject to separate encoding
* may be added at any time.
*
* @stable ICU 2.2
*/
typedef enum UScriptCode {
/*
* Note: UScriptCode constants and their ISO script code comments
* are parsed by preparseucd.py.
* It matches lines like
* USCRIPT_<Unicode Script value name> = <integer>, / * <ISO script code> * /
* Constants defined as aliases of another constant (USCRIPT_X = USCRIPT_Y)
* carry no integer literal and no ISO code comment of their own.
*/
/** @stable ICU 2.2 */
USCRIPT_INVALID_CODE = -1,
/** @stable ICU 2.2 */
USCRIPT_COMMON = 0, /* Zyyy */
/** @stable ICU 2.2 */
USCRIPT_INHERITED = 1, /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */
/** @stable ICU 2.2 */
USCRIPT_ARABIC = 2, /* Arab */
/** @stable ICU 2.2 */
USCRIPT_ARMENIAN = 3, /* Armn */
/** @stable ICU 2.2 */
USCRIPT_BENGALI = 4, /* Beng */
/** @stable ICU 2.2 */
USCRIPT_BOPOMOFO = 5, /* Bopo */
/** @stable ICU 2.2 */
USCRIPT_CHEROKEE = 6, /* Cher */
/** @stable ICU 2.2 */
USCRIPT_COPTIC = 7, /* Copt */
/** @stable ICU 2.2 */
USCRIPT_CYRILLIC = 8, /* Cyrl */
/** @stable ICU 2.2 */
USCRIPT_DESERET = 9, /* Dsrt */
/** @stable ICU 2.2 */
USCRIPT_DEVANAGARI = 10, /* Deva */
/** @stable ICU 2.2 */
USCRIPT_ETHIOPIC = 11, /* Ethi */
/** @stable ICU 2.2 */
USCRIPT_GEORGIAN = 12, /* Geor */
/** @stable ICU 2.2 */
USCRIPT_GOTHIC = 13, /* Goth */
/** @stable ICU 2.2 */
USCRIPT_GREEK = 14, /* Grek */
/** @stable ICU 2.2 */
USCRIPT_GUJARATI = 15, /* Gujr */
/** @stable ICU 2.2 */
USCRIPT_GURMUKHI = 16, /* Guru */
/** @stable ICU 2.2 */
USCRIPT_HAN = 17, /* Hani */
/** @stable ICU 2.2 */
USCRIPT_HANGUL = 18, /* Hang */
/** @stable ICU 2.2 */
USCRIPT_HEBREW = 19, /* Hebr */
/** @stable ICU 2.2 */
USCRIPT_HIRAGANA = 20, /* Hira */
/** @stable ICU 2.2 */
USCRIPT_KANNADA = 21, /* Knda */
/** @stable ICU 2.2 */
USCRIPT_KATAKANA = 22, /* Kana */
/** @stable ICU 2.2 */
USCRIPT_KHMER = 23, /* Khmr */
/** @stable ICU 2.2 */
USCRIPT_LAO = 24, /* Laoo */
/** @stable ICU 2.2 */
USCRIPT_LATIN = 25, /* Latn */
/** @stable ICU 2.2 */
USCRIPT_MALAYALAM = 26, /* Mlym */
/** @stable ICU 2.2 */
USCRIPT_MONGOLIAN = 27, /* Mong */
/** @stable ICU 2.2 */
USCRIPT_MYANMAR = 28, /* Mymr */
/** @stable ICU 2.2 */
USCRIPT_OGHAM = 29, /* Ogam */
/** @stable ICU 2.2 */
USCRIPT_OLD_ITALIC = 30, /* Ital */
/** @stable ICU 2.2 */
USCRIPT_ORIYA = 31, /* Orya */
/** @stable ICU 2.2 */
USCRIPT_RUNIC = 32, /* Runr */
/** @stable ICU 2.2 */
USCRIPT_SINHALA = 33, /* Sinh */
/** @stable ICU 2.2 */
USCRIPT_SYRIAC = 34, /* Syrc */
/** @stable ICU 2.2 */
USCRIPT_TAMIL = 35, /* Taml */
/** @stable ICU 2.2 */
USCRIPT_TELUGU = 36, /* Telu */
/** @stable ICU 2.2 */
USCRIPT_THAANA = 37, /* Thaa */
/** @stable ICU 2.2 */
USCRIPT_THAI = 38, /* Thai */
/** @stable ICU 2.2 */
USCRIPT_TIBETAN = 39, /* Tibt */
/** Canadian_Aboriginal script. @stable ICU 2.6 */
USCRIPT_CANADIAN_ABORIGINAL = 40, /* Cans */
/** Canadian_Aboriginal script (alias). @stable ICU 2.2 */
USCRIPT_UCAS = USCRIPT_CANADIAN_ABORIGINAL,
/** @stable ICU 2.2 */
USCRIPT_YI = 41, /* Yiii */
/* New scripts in Unicode 3.2 */
/** @stable ICU 2.2 */
USCRIPT_TAGALOG = 42, /* Tglg */
/** @stable ICU 2.2 */
USCRIPT_HANUNOO = 43, /* Hano */
/** @stable ICU 2.2 */
USCRIPT_BUHID = 44, /* Buhd */
/** @stable ICU 2.2 */
USCRIPT_TAGBANWA = 45, /* Tagb */
/* New scripts in Unicode 4 */
/** @stable ICU 2.6 */
USCRIPT_BRAILLE = 46, /* Brai */
/** @stable ICU 2.6 */
USCRIPT_CYPRIOT = 47, /* Cprt */
/** @stable ICU 2.6 */
USCRIPT_LIMBU = 48, /* Limb */
/** @stable ICU 2.6 */
USCRIPT_LINEAR_B = 49, /* Linb */
/** @stable ICU 2.6 */
USCRIPT_OSMANYA = 50, /* Osma */
/** @stable ICU 2.6 */
USCRIPT_SHAVIAN = 51, /* Shaw */
/** @stable ICU 2.6 */
USCRIPT_TAI_LE = 52, /* Tale */
/** @stable ICU 2.6 */
USCRIPT_UGARITIC = 53, /* Ugar */
/** New script code in Unicode 4.0.1 @stable ICU 3.0 */
USCRIPT_KATAKANA_OR_HIRAGANA = 54,/*Hrkt */
/* New scripts in Unicode 4.1 */
/** @stable ICU 3.4 */
USCRIPT_BUGINESE = 55, /* Bugi */
/** @stable ICU 3.4 */
USCRIPT_GLAGOLITIC = 56, /* Glag */
/** @stable ICU 3.4 */
USCRIPT_KHAROSHTHI = 57, /* Khar */
/** @stable ICU 3.4 */
USCRIPT_SYLOTI_NAGRI = 58, /* Sylo */
/** @stable ICU 3.4 */
USCRIPT_NEW_TAI_LUE = 59, /* Talu */
/** @stable ICU 3.4 */
USCRIPT_TIFINAGH = 60, /* Tfng */
/** @stable ICU 3.4 */
USCRIPT_OLD_PERSIAN = 61, /* Xpeo */
/* New script codes from Unicode and ISO 15924 */
/** @stable ICU 3.6 */
USCRIPT_BALINESE = 62, /* Bali */
/** @stable ICU 3.6 */
USCRIPT_BATAK = 63, /* Batk */
/** @stable ICU 3.6 */
USCRIPT_BLISSYMBOLS = 64, /* Blis */
/** @stable ICU 3.6 */
USCRIPT_BRAHMI = 65, /* Brah */
/** @stable ICU 3.6 */
USCRIPT_CHAM = 66, /* Cham */
/** @stable ICU 3.6 */
USCRIPT_CIRTH = 67, /* Cirt */
/** @stable ICU 3.6 */
USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC = 68, /* Cyrs */
/** @stable ICU 3.6 */
USCRIPT_DEMOTIC_EGYPTIAN = 69, /* Egyd */
/** @stable ICU 3.6 */
USCRIPT_HIERATIC_EGYPTIAN = 70, /* Egyh */
/** @stable ICU 3.6 */
USCRIPT_EGYPTIAN_HIEROGLYPHS = 71, /* Egyp */
/** @stable ICU 3.6 */
USCRIPT_KHUTSURI = 72, /* Geok */
/** @stable ICU 3.6 */
USCRIPT_SIMPLIFIED_HAN = 73, /* Hans */
/** @stable ICU 3.6 */
USCRIPT_TRADITIONAL_HAN = 74, /* Hant */
/** @stable ICU 3.6 */
USCRIPT_PAHAWH_HMONG = 75, /* Hmng */
/** @stable ICU 3.6 */
USCRIPT_OLD_HUNGARIAN = 76, /* Hung */
/** @stable ICU 3.6 */
USCRIPT_HARAPPAN_INDUS = 77, /* Inds */
/** @stable ICU 3.6 */
USCRIPT_JAVANESE = 78, /* Java */
/** @stable ICU 3.6 */
USCRIPT_KAYAH_LI = 79, /* Kali */
/** @stable ICU 3.6 */
USCRIPT_LATIN_FRAKTUR = 80, /* Latf */
/** @stable ICU 3.6 */
USCRIPT_LATIN_GAELIC = 81, /* Latg */
/** @stable ICU 3.6 */
USCRIPT_LEPCHA = 82, /* Lepc */
/** @stable ICU 3.6 */
USCRIPT_LINEAR_A = 83, /* Lina */
/** @stable ICU 4.6 */
USCRIPT_MANDAIC = 84, /* Mand */
/** Alias for USCRIPT_MANDAIC. @stable ICU 3.6 */
USCRIPT_MANDAEAN = USCRIPT_MANDAIC,
/** @stable ICU 3.6 */
USCRIPT_MAYAN_HIEROGLYPHS = 85, /* Maya */
/** @stable ICU 4.6 */
USCRIPT_MEROITIC_HIEROGLYPHS = 86, /* Mero */
/** Alias for USCRIPT_MEROITIC_HIEROGLYPHS. @stable ICU 3.6 */
USCRIPT_MEROITIC = USCRIPT_MEROITIC_HIEROGLYPHS,
/** @stable ICU 3.6 */
USCRIPT_NKO = 87, /* Nkoo */
/** @stable ICU 3.6 */
USCRIPT_ORKHON = 88, /* Orkh */
/** @stable ICU 3.6 */
USCRIPT_OLD_PERMIC = 89, /* Perm */
/** @stable ICU 3.6 */
USCRIPT_PHAGS_PA = 90, /* Phag */
/** @stable ICU 3.6 */
USCRIPT_PHOENICIAN = 91, /* Phnx */
/** @stable ICU 52 */
USCRIPT_MIAO = 92, /* Plrd */
/** Alias for USCRIPT_MIAO. @stable ICU 3.6 */
USCRIPT_PHONETIC_POLLARD = USCRIPT_MIAO,
/** @stable ICU 3.6 */
USCRIPT_RONGORONGO = 93, /* Roro */
/** @stable ICU 3.6 */
USCRIPT_SARATI = 94, /* Sara */
/** @stable ICU 3.6 */
USCRIPT_ESTRANGELO_SYRIAC = 95, /* Syre */
/** @stable ICU 3.6 */
USCRIPT_WESTERN_SYRIAC = 96, /* Syrj */
/** @stable ICU 3.6 */
USCRIPT_EASTERN_SYRIAC = 97, /* Syrn */
/** @stable ICU 3.6 */
USCRIPT_TENGWAR = 98, /* Teng */
/** @stable ICU 3.6 */
USCRIPT_VAI = 99, /* Vaii */
/** @stable ICU 3.6 */
USCRIPT_VISIBLE_SPEECH = 100,/* Visp */
/** @stable ICU 3.6 */
USCRIPT_CUNEIFORM = 101,/* Xsux */
/** @stable ICU 3.6 */
USCRIPT_UNWRITTEN_LANGUAGES = 102,/* Zxxx */
/** @stable ICU 3.6 */
USCRIPT_UNKNOWN = 103,/* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */
/** @stable ICU 3.8 */
USCRIPT_CARIAN = 104,/* Cari */
/** @stable ICU 3.8 */
USCRIPT_JAPANESE = 105,/* Jpan */
/** @stable ICU 3.8 */
USCRIPT_LANNA = 106,/* Lana */
/** @stable ICU 3.8 */
USCRIPT_LYCIAN = 107,/* Lyci */
/** @stable ICU 3.8 */
USCRIPT_LYDIAN = 108,/* Lydi */
/** @stable ICU 3.8 */
USCRIPT_OL_CHIKI = 109,/* Olck */
/** @stable ICU 3.8 */
USCRIPT_REJANG = 110,/* Rjng */
/** @stable ICU 3.8 */
USCRIPT_SAURASHTRA = 111,/* Saur */
/** Sutton SignWriting @stable ICU 3.8 */
USCRIPT_SIGN_WRITING = 112,/* Sgnw */
/** @stable ICU 3.8 */
USCRIPT_SUNDANESE = 113,/* Sund */
/** @stable ICU 3.8 */
USCRIPT_MOON = 114,/* Moon */
/** @stable ICU 3.8 */
USCRIPT_MEITEI_MAYEK = 115,/* Mtei */
/** @stable ICU 4.0 */
USCRIPT_IMPERIAL_ARAMAIC = 116,/* Armi */
/** @stable ICU 4.0 */
USCRIPT_AVESTAN = 117,/* Avst */
/** @stable ICU 4.0 */
USCRIPT_CHAKMA = 118,/* Cakm */
/** @stable ICU 4.0 */
USCRIPT_KOREAN = 119,/* Kore */
/** @stable ICU 4.0 */
USCRIPT_KAITHI = 120,/* Kthi */
/** @stable ICU 4.0 */
USCRIPT_MANICHAEAN = 121,/* Mani */
/** @stable ICU 4.0 */
USCRIPT_INSCRIPTIONAL_PAHLAVI = 122,/* Phli */
/** @stable ICU 4.0 */
USCRIPT_PSALTER_PAHLAVI = 123,/* Phlp */
/** @stable ICU 4.0 */
USCRIPT_BOOK_PAHLAVI = 124,/* Phlv */
/** @stable ICU 4.0 */
USCRIPT_INSCRIPTIONAL_PARTHIAN = 125,/* Prti */
/** @stable ICU 4.0 */
USCRIPT_SAMARITAN = 126,/* Samr */
/** @stable ICU 4.0 */
USCRIPT_TAI_VIET = 127,/* Tavt */
/** @stable ICU 4.0 */
USCRIPT_MATHEMATICAL_NOTATION = 128,/* Zmth */
/** @stable ICU 4.0 */
USCRIPT_SYMBOLS = 129,/* Zsym */
/** @stable ICU 4.4 */
USCRIPT_BAMUM = 130,/* Bamu */
/** @stable ICU 4.4 */
USCRIPT_LISU = 131,/* Lisu */
/** @stable ICU 4.4 */
USCRIPT_NAKHI_GEBA = 132,/* Nkgb */
/** @stable ICU 4.4 */
USCRIPT_OLD_SOUTH_ARABIAN = 133,/* Sarb */
/** @stable ICU 4.6 */
USCRIPT_BASSA_VAH = 134,/* Bass */
/** @stable ICU 54 */
USCRIPT_DUPLOYAN = 135,/* Dupl */
#ifndef U_HIDE_DEPRECATED_API
/** @deprecated ICU 54 Typo, use USCRIPT_DUPLOYAN */
USCRIPT_DUPLOYAN_SHORTAND = USCRIPT_DUPLOYAN,
#endif /* U_HIDE_DEPRECATED_API */
/** @stable ICU 4.6 */
USCRIPT_ELBASAN = 136,/* Elba */
/** @stable ICU 4.6 */
USCRIPT_GRANTHA = 137,/* Gran */
/** @stable ICU 4.6 */
USCRIPT_KPELLE = 138,/* Kpel */
/** @stable ICU 4.6 */
USCRIPT_LOMA = 139,/* Loma */
/** Mende Kikakui @stable ICU 4.6 */
USCRIPT_MENDE = 140,/* Mend */
/** @stable ICU 4.6 */
USCRIPT_MEROITIC_CURSIVE = 141,/* Merc */
/** @stable ICU 4.6 */
USCRIPT_OLD_NORTH_ARABIAN = 142,/* Narb */
/** @stable ICU 4.6 */
USCRIPT_NABATAEAN = 143,/* Nbat */
/** @stable ICU 4.6 */
USCRIPT_PALMYRENE = 144,/* Palm */
/** @stable ICU 54 */
USCRIPT_KHUDAWADI = 145,/* Sind */
/** Alias for USCRIPT_KHUDAWADI. @stable ICU 4.6 */
USCRIPT_SINDHI = USCRIPT_KHUDAWADI,
/** @stable ICU 4.6 */
USCRIPT_WARANG_CITI = 146,/* Wara */
/** @stable ICU 4.8 */
USCRIPT_AFAKA = 147,/* Afak */
/** @stable ICU 4.8 */
USCRIPT_JURCHEN = 148,/* Jurc */
/** @stable ICU 4.8 */
USCRIPT_MRO = 149,/* Mroo */
/** @stable ICU 4.8 */
USCRIPT_NUSHU = 150,/* Nshu */
/** @stable ICU 4.8 */
USCRIPT_SHARADA = 151,/* Shrd */
/** @stable ICU 4.8 */
USCRIPT_SORA_SOMPENG = 152,/* Sora */
/** @stable ICU 4.8 */
USCRIPT_TAKRI = 153,/* Takr */
/** @stable ICU 4.8 */
USCRIPT_TANGUT = 154,/* Tang */
/** @stable ICU 4.8 */
USCRIPT_WOLEAI = 155,/* Wole */
/** @stable ICU 49 */
USCRIPT_ANATOLIAN_HIEROGLYPHS = 156,/* Hluw */
/** @stable ICU 49 */
USCRIPT_KHOJKI = 157,/* Khoj */
/** @stable ICU 49 */
USCRIPT_TIRHUTA = 158,/* Tirh */
/** @stable ICU 52 */
USCRIPT_CAUCASIAN_ALBANIAN = 159,/* Aghb */
/** @stable ICU 52 */
USCRIPT_MAHAJANI = 160,/* Mahj */
/** @stable ICU 54 */
USCRIPT_AHOM = 161,/* Ahom */
/** @stable ICU 54 */
USCRIPT_HATRAN = 162,/* Hatr */
/** @stable ICU 54 */
USCRIPT_MODI = 163,/* Modi */
/** @stable ICU 54 */
USCRIPT_MULTANI = 164,/* Mult */
/** @stable ICU 54 */
USCRIPT_PAU_CIN_HAU = 165,/* Pauc */
/** @stable ICU 54 */
USCRIPT_SIDDHAM = 166,/* Sidd */
/** @stable ICU 58 */
USCRIPT_ADLAM = 167,/* Adlm */
/** @stable ICU 58 */
USCRIPT_BHAIKSUKI = 168,/* Bhks */
/** @stable ICU 58 */
USCRIPT_MARCHEN = 169,/* Marc */
/** @stable ICU 58 */
USCRIPT_NEWA = 170,/* Newa */
/** @stable ICU 58 */
USCRIPT_OSAGE = 171,/* Osge */
/** @stable ICU 58 */
USCRIPT_HAN_WITH_BOPOMOFO = 172,/* Hanb */
/** @stable ICU 58 */
USCRIPT_JAMO = 173,/* Jamo */
/** @stable ICU 58 */
USCRIPT_SYMBOLS_EMOJI = 174,/* Zsye */
/** @stable ICU 60 */
USCRIPT_MASARAM_GONDI = 175,/* Gonm */
/** @stable ICU 60 */
USCRIPT_SOYOMBO = 176,/* Soyo */
/** @stable ICU 60 */
USCRIPT_ZANABAZAR_SQUARE = 177,/* Zanb */
/** @stable ICU 62 */
USCRIPT_DOGRA = 178,/* Dogr */
/** @stable ICU 62 */
USCRIPT_GUNJALA_GONDI = 179,/* Gong */
/** @stable ICU 62 */
USCRIPT_MAKASAR = 180,/* Maka */
/** @stable ICU 62 */
USCRIPT_MEDEFAIDRIN = 181,/* Medf */
/** @stable ICU 62 */
USCRIPT_HANIFI_ROHINGYA = 182,/* Rohg */
/** @stable ICU 62 */
USCRIPT_SOGDIAN = 183,/* Sogd */
/** @stable ICU 62 */
USCRIPT_OLD_SOGDIAN = 184,/* Sogo */
/** @stable ICU 64 */
USCRIPT_ELYMAIC = 185,/* Elym */
/** @stable ICU 64 */
USCRIPT_NYIAKENG_PUACHUE_HMONG = 186,/* Hmnp */
/** @stable ICU 64 */
USCRIPT_NANDINAGARI = 187,/* Nand */
/** @stable ICU 64 */
USCRIPT_WANCHO = 188,/* Wcho */
/** @stable ICU 66 */
USCRIPT_CHORASMIAN = 189,/* Chrs */
/** @stable ICU 66 */
USCRIPT_DIVES_AKURU = 190,/* Diak */
/** @stable ICU 66 */
USCRIPT_KHITAN_SMALL_SCRIPT = 191,/* Kits */
/** @stable ICU 66 */
USCRIPT_YEZIDI = 192,/* Yezi */
/** @stable ICU 70 */
USCRIPT_CYPRO_MINOAN = 193,/* Cpmn */
/** @stable ICU 70 */
USCRIPT_OLD_UYGHUR = 194,/* Ougr */
/** @stable ICU 70 */
USCRIPT_TANGSA = 195,/* Tnsa */
/** @stable ICU 70 */
USCRIPT_TOTO = 196,/* Toto */
/** @stable ICU 70 */
USCRIPT_VITHKUQI = 197,/* Vith */
/** @stable ICU 72 */
USCRIPT_KAWI = 198,/* Kawi */
/** @stable ICU 72 */
USCRIPT_NAG_MUNDARI = 199,/* Nagm */
/** @stable ICU 75 */
USCRIPT_ARABIC_NASTALIQ = 200, /* Aran */
/** @stable ICU 76 */
USCRIPT_GARAY = 201, /* Gara */
/** @stable ICU 76 */
USCRIPT_GURUNG_KHEMA = 202, /* Gukh */
/** @stable ICU 76 */
USCRIPT_KIRAT_RAI = 203, /* Krai */
/** @stable ICU 76 */
USCRIPT_OL_ONAL = 204, /* Onao */
/** @stable ICU 76 */
USCRIPT_SUNUWAR = 205, /* Sunu */
/** @stable ICU 76 */
USCRIPT_TODHRI = 206, /* Todr */
/** @stable ICU 76 */
USCRIPT_TULU_TIGALARI = 207, /* Tutg */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal UScriptCode value
* (here: USCRIPT_TULU_TIGALARI + 1), so it has to be bumped
* whenever a new highest constant is added above.
* The highest value is available via u_getIntPropertyMaxValue(UCHAR_SCRIPT).
*
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
USCRIPT_CODE_LIMIT = 208
#endif // U_HIDE_DEPRECATED_API
} UScriptCode;
/**
* Gets the script codes associated with the given locale or ISO 15924 abbreviation or name.
* Fills in USCRIPT_MALAYALAM given "Malayalam" OR "Mlym".
* Fills in USCRIPT_LATIN given "en" OR "en_US"
* If the required capacity is greater than the capacity of the destination buffer,
* then the error code is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned.
*
* <p>Note: To search by short or long script alias only, use
* u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead. That does
* a fast lookup with no access of the locale data.
*
* @param nameOrAbbrOrLocale name of the script, as given in
* PropertyValueAliases.txt, or ISO 15924 code or locale
* @param fillIn the UScriptCode buffer to fill in the script code
* @param capacity the capacity (size) of UScriptCode buffer passed in.
* @param err the error status code.
* @return The number of script codes filled in the buffer passed in
* @stable ICU 2.4
*/
U_CAPI int32_t U_EXPORT2
uscript_getCode(const char* nameOrAbbrOrLocale,UScriptCode* fillIn,int32_t capacity,UErrorCode *err);
/**
* Returns the long Unicode script name, if there is one.
* Otherwise returns the 4-letter ISO 15924 script code.
* Returns "Malayalam" given USCRIPT_MALAYALAM.
*
* @param scriptCode UScriptCode enum
* @return long script name as given in PropertyValueAliases.txt, or the 4-letter code,
* or NULL if scriptCode is invalid
* @stable ICU 2.4
*/
U_CAPI const char* U_EXPORT2
uscript_getName(UScriptCode scriptCode);
/**
* Returns the 4-letter ISO 15924 script code,
* which is the same as the short Unicode script name if Unicode has names for the script.
* Returns "Mlym" given USCRIPT_MALAYALAM.
*
* @param scriptCode UScriptCode enum
* @return short script name (4-letter code), or NULL if scriptCode is invalid
* @stable ICU 2.4
*/
U_CAPI const char* U_EXPORT2
uscript_getShortName(UScriptCode scriptCode);
/**
* Gets the script code associated with the given codepoint.
* Returns USCRIPT_MALAYALAM given 0x0D02
* @param codepoint UChar32 codepoint
* @param err the error status code.
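* @return The UScriptCode, or 0 if codepoint is invalid.
* NOTE(review): 0 is also the value of USCRIPT_COMMON, and USCRIPT_INVALID_CODE (-1)
* exists in UScriptCode; confirm against the implementation which value is
* returned for an invalid code point, and check err rather than the return value.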
* @stable ICU 2.4
*/
U_CAPI UScriptCode U_EXPORT2
uscript_getScript(UChar32 codepoint, UErrorCode *err);
/**
* Do the Script_Extensions of code point c contain script sc?
* If c does not have explicit Script_Extensions, then this tests whether
* c has the Script property value sc.
*
* Some characters are commonly used in multiple scripts.
* For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
* @param c code point
* @param sc script code
* @return true if sc is in Script_Extensions(c)
* @stable ICU 49
*/
U_CAPI UBool U_EXPORT2
uscript_hasScript(UChar32 c, UScriptCode sc);
/**
* Writes code point c's Script_Extensions as a list of UScriptCode values
* to the output scripts array and returns the number of script codes.
* - If c does have Script_Extensions, then the Script property value
* (normally Common or Inherited) is not included.
* - If c does not have Script_Extensions, then the one Script code is written to the output array.
* - If c is not a valid code point, then the one USCRIPT_UNKNOWN code is written.
* In other words, if the return value is 1,
* then the output array contains exactly c's single Script code.
* If the return value is n>=2, then the output array contains c's n Script_Extensions script codes.
*
* Some characters are commonly used in multiple scripts.
* For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
*
* If there are more than capacity script codes to be written, then
* U_BUFFER_OVERFLOW_ERROR is set and the number of Script_Extensions is returned.
* (Usual ICU buffer handling behavior.)
*
* @param c code point
* @param scripts output script code array
* @param capacity capacity of the scripts array
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return number of script codes in c's Script_Extensions, or 1 for the single Script value,
* written to scripts unless U_BUFFER_OVERFLOW_ERROR indicates insufficient capacity
* @stable ICU 49
*/
U_CAPI int32_t U_EXPORT2
uscript_getScriptExtensions(UChar32 c,
UScriptCode *scripts, int32_t capacity,
UErrorCode *errorCode);
/**
* Script usage constants.
* See UAX #31 Unicode Identifier and Pattern Syntax.
* http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers
*
* @stable ICU 51
*/
typedef enum UScriptUsage {
    /* The explicit values below are the ones the compiler assigns implicitly. */
    /** Not encoded in Unicode. @stable ICU 51 */
    USCRIPT_USAGE_NOT_ENCODED = 0,
    /** Unknown script usage. @stable ICU 51 */
    USCRIPT_USAGE_UNKNOWN = 1,
    /** Candidate for Exclusion from Identifiers. @stable ICU 51 */
    USCRIPT_USAGE_EXCLUDED = 2,
    /** Limited Use script. @stable ICU 51 */
    USCRIPT_USAGE_LIMITED_USE = 3,
    /** Aspirational Use script. @stable ICU 51 */
    USCRIPT_USAGE_ASPIRATIONAL = 4,
    /** Recommended script. @stable ICU 51 */
    USCRIPT_USAGE_RECOMMENDED = 5
} UScriptUsage;
/**
* Writes the script sample character string.
* This string normally consists of one code point but might be longer.
* The string is empty if the script is not encoded.
*
* @param script script code
* @param dest output string array
* @param capacity number of UChars in the dest array
* @param pErrorCode standard ICU in/out error code, must pass U_SUCCESS() on input
* @return the string length, even if U_BUFFER_OVERFLOW_ERROR
* @stable ICU 51
*/
U_CAPI int32_t U_EXPORT2
uscript_getSampleString(UScriptCode script, UChar *dest, int32_t capacity, UErrorCode *pErrorCode);
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
class UnicodeString;
U_NAMESPACE_END
/**
* Returns the script sample character string.
* This string normally consists of one code point but might be longer.
* The string is empty if the script is not encoded.
*
* @param script script code
* @return the sample character string
* @stable ICU 51
*/
U_COMMON_API icu::UnicodeString U_EXPORT2
uscript_getSampleUnicodeString(UScriptCode script);
#endif
/**
* Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax.
* Returns USCRIPT_USAGE_NOT_ENCODED if the script is not encoded in Unicode.
*
* @param script script code
* @return script usage
* @see UScriptUsage
* @stable ICU 51
*/
U_CAPI UScriptUsage U_EXPORT2
uscript_getUsage(UScriptCode script);
/**
* Returns true if the script is written right-to-left.
* For example, Arab and Hebr.
*
* @param script script code
* @return true if the script is right-to-left
* @stable ICU 51
*/
U_CAPI UBool U_EXPORT2
uscript_isRightToLeft(UScriptCode script);
/**
* Returns true if the script allows line breaks between letters (excluding hyphenation).
* Such a script typically requires dictionary-based line breaking.
* For example, Hani and Thai.
*
* @param script script code
* @return true if the script allows line breaks between letters
* @stable ICU 51
*/
U_CAPI UBool U_EXPORT2
uscript_breaksBetweenLetters(UScriptCode script);
/**
* Returns true if in modern (or most recent) usage of the script case distinctions are customary.
* For example, Latn and Cyrl.
*
* @param script script code
* @return true if the script is cased
* @stable ICU 51
*/
U_CAPI UBool U_EXPORT2
uscript_isCased(UScriptCode script);
#endif

1908
thirdparty/icu4c/common/unicode/uset.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,323 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2002-2014, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef USETITER_H
#define USETITER_H
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
#include "unicode/unistr.h"
/**
* \file
* \brief C++ API: UnicodeSetIterator iterates over the contents of a UnicodeSet.
*/
U_NAMESPACE_BEGIN
class UnicodeSet;
class UnicodeString;
/**
*
* UnicodeSetIterator iterates over the contents of a UnicodeSet. It
* iterates over either code points or code point ranges. After all
* code points or ranges have been returned, it returns the
* multicharacter strings of the UnicodeSet, if any.
*
* This class is not intended for public subclassing.
*
* <p>To iterate over code points and strings, use a loop like this:
* <pre>
* UnicodeSetIterator it(set);
* while (it.next()) {
* processItem(it.getString());
* }
* </pre>
* <p>Each item in the set is accessed as a string. Set elements
* consisting of single code points are returned as strings containing
* just the one code point.
*
* <p>To iterate over code point ranges, instead of individual code points,
* use a loop like this:
* <pre>
* UnicodeSetIterator it(set);
* while (it.nextRange()) {
* if (it.isString()) {
* processString(it.getString());
* } else {
* processCodepointRange(it.getCodepoint(), it.getCodepointEnd());
* }
* }
* </pre>
*
* To iterate over only the strings, start with <code>skipToStrings()</code>.
*
* @author M. Davis
* @stable ICU 2.4
*/
class U_COMMON_API UnicodeSetIterator final : public UObject {
/**
* Value of <tt>codepoint</tt> if the iterator points to a string.
* If <tt>codepoint == IS_STRING</tt>, then examine
* <tt>string</tt> for the current iteration result.
*/
enum { IS_STRING = -1 };
/**
* Current code point, or the special value <tt>IS_STRING</tt>, if
* the iterator points to a string.
*/
UChar32 codepoint;
/**
* When iterating over ranges using <tt>nextRange()</tt>,
* <tt>codepointEnd</tt> contains the inclusive end of the
* iteration range, if <tt>codepoint != IS_STRING</tt>. If
* iterating over code points using <tt>next()</tt>, or if
* <tt>codepoint == IS_STRING</tt>, then the value of
* <tt>codepointEnd</tt> is undefined.
*/
UChar32 codepointEnd;
/**
* If <tt>codepoint == IS_STRING</tt>, then <tt>string</tt> points
* to the current string. If <tt>codepoint != IS_STRING</tt>, the
* value of <tt>string</tt> is undefined.
*/
const UnicodeString* string;
public:
/**
* Create an iterator over the given set. The iterator is valid
* only so long as <tt>set</tt> is valid.
* @param set set to iterate over
* @stable ICU 2.4
*/
UnicodeSetIterator(const UnicodeSet& set);
/**
* Create an iterator over nothing. <tt>next()</tt> and
* <tt>nextRange()</tt> return false. This is a convenience
* constructor allowing the target to be set later.
* @stable ICU 2.4
*/
UnicodeSetIterator();
/**
* Destructor.
* @stable ICU 2.4
*/
virtual ~UnicodeSetIterator();
/**
* Returns true if the current element is a string. If so, the
* caller can retrieve it with <tt>getString()</tt>. If this
* method returns false, the current element is a code point or
* code point range, depending on whether <tt>next()</tt> or
* <tt>nextRange()</tt> was called.
* Elements of types string and codepoint can both be retrieved
* with the function <tt>getString()</tt>.
* Elements of type codepoint can also be retrieved with
* <tt>getCodepoint()</tt>.
* For ranges, <tt>getCodepoint()</tt> returns the starting codepoint
* of the range, and <tt>getCodepointEnd()</tt> returns the end
* of the range.
* @stable ICU 2.4
*/
inline UBool isString() const;
/**
* Returns the current code point, if <tt>isString()</tt> returned
* false. Otherwise returns an undefined result.
* @stable ICU 2.4
*/
inline UChar32 getCodepoint() const;
/**
* Returns the end of the current code point range, if
* <tt>isString()</tt> returned false and <tt>nextRange()</tt> was
* called. Otherwise returns an undefined result.
* @stable ICU 2.4
*/
inline UChar32 getCodepointEnd() const;
/**
* Returns the current string, if <tt>isString()</tt> returned
* true. If the current iteration item is a code point, a UnicodeString
* containing that single code point is returned.
*
* Ownership of the returned string remains with the iterator.
* The string is guaranteed to remain valid only until the iterator is
* advanced to the next item, or until the iterator is deleted.
*
* @stable ICU 2.4
*/
const UnicodeString& getString();
/**
* Skips over the remaining code points/ranges, if any.
* A following call to next() or nextRange() will yield a string, if there is one.
* No-op if next() would return false, or if it would yield a string anyway.
*
* @return *this
* @stable ICU 70
* @see UnicodeSet#strings()
*/
inline UnicodeSetIterator &skipToStrings() {
// Finish code point/range iteration.
// Only the range/element cursors are changed; nextString and stringCount
// are left alone here.
range = endRange;
endElement = -1;
nextElement = 0;
return *this;
}
/**
* Advances the iteration position to the next element in the set,
* which can be either a single code point or a string.
* If there are no more elements in the set, return false.
*
* <p>
* If <tt>isString() == true</tt>, the value is a
* string, otherwise the value is a
* single code point. Elements of either type can be retrieved
* with the function <tt>getString()</tt>, while elements
* consisting of a single code point can also be retrieved with
* <tt>getCodepoint()</tt>.
*
* <p>The order of iteration is all code points in sorted order,
* followed by all strings in sorted order. Do not mix
* calls to <tt>next()</tt> and <tt>nextRange()</tt> without
* calling <tt>reset()</tt> between them. The results of doing so
* are undefined.
*
* @return true if there was another element in the set.
* @stable ICU 2.4
*/
UBool next();
/**
* Returns the next element in the set, either a code point range
* or a string. If there are no more elements in the set, return
* false. If <tt>isString() == true</tt>, the value is a
* string and can be accessed with <tt>getString()</tt>. Otherwise the value is a
* range of one or more code points from <tt>getCodepoint()</tt> to
* <tt>getCodepointEnd()</tt> inclusive.
*
* <p>The order of iteration is all code point ranges in sorted
* order, followed by all strings in sorted order. Ranges are
* disjoint and non-contiguous. The value returned from <tt>getString()</tt>
* is undefined unless <tt>isString() == true</tt>. Do not mix calls to
* <tt>next()</tt> and <tt>nextRange()</tt> without calling
* <tt>reset()</tt> between them. The results of doing so are
* undefined.
*
* @return true if there was another element in the set.
* @stable ICU 2.4
*/
UBool nextRange();
/**
* Sets this iterator to visit the elements of the given set and
* resets it to the start of that set. The iterator is valid only
* so long as <tt>set</tt> is valid.
* @param set the set to iterate over.
* @stable ICU 2.4
*/
void reset(const UnicodeSet& set);
/**
* Resets this iterator to the start of the set.
* @stable ICU 2.4
*/
void reset();
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*
* @stable ICU 2.4
*/
static UClassID U_EXPORT2 getStaticClassID();
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*
* @stable ICU 2.4
*/
virtual UClassID getDynamicClassID() const override;
// ======================= PRIVATES ===========================
private:
// endElement and nextElement are really UChar32's, but we keep
// them as signed int32_t's so we can do comparisons with
// endElement set to -1. Leave them as int32_t's.
/** The set being iterated over; the iterator is valid only so long as
* that set is valid (see the constructor and reset(const UnicodeSet&)).
*/
const UnicodeSet* set;
/** End range: the value <tt>range</tt> takes once code point/range
* iteration is finished (see skipToStrings()).
*/
int32_t endRange;
/** Range: cursor over the set's code point ranges
* (presumably an index of the current range -- TODO confirm in the implementation).
*/
int32_t range;
/** End element; set to -1 by skipToStrings() when no code points remain.
*/
int32_t endElement;
/** Next element; set to 0 by skipToStrings().
*/
int32_t nextElement;
/** Next string
* (presumably the index of the next string to return -- TODO confirm).
*/
int32_t nextString;
/** String count
* (presumably the number of strings in the set -- TODO confirm).
*/
int32_t stringCount;
/**
* Points to the string to use when the caller asks for a
* string and the current iteration item is a code point, not a string.
*/
UnicodeString *cpString;
/** Copy constructor. Disallowed.
*/
UnicodeSetIterator(const UnicodeSetIterator&) = delete;
/** Assignment operator. Disallowed.
*/
UnicodeSetIterator& operator=(const UnicodeSetIterator&) = delete;
/** Load range
*/
void loadRange(int32_t range);
};
// The current element is a string exactly when codepoint is negative.
inline UBool UnicodeSetIterator::isString() const {
    const UBool currentIsString = (codepoint < 0);
    return currentIsString;
}
// Plain accessor for the codepoint member; no check of isString() is made here.
inline UChar32 UnicodeSetIterator::getCodepoint() const {
    return this->codepoint;
}
// Plain accessor for the codepointEnd member; no check of isString() is made here.
inline UChar32 UnicodeSetIterator::getCodepointEnd() const {
    return this->codepointEnd;
}
U_NAMESPACE_END
#endif /* U_SHOW_CPLUSPLUS_API */
#endif

476
thirdparty/icu4c/common/unicode/ushape.h vendored Normal file
View File

@@ -0,0 +1,476 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
* Copyright (C) 2000-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: ushape.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2000jun29
* created by: Markus W. Scherer
*/
#ifndef __USHAPE_H__
#define __USHAPE_H__
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Arabic shaping
*
*/
/**
* Shape Arabic text on a character basis.
*
* <p>This function performs basic operations for "shaping" Arabic text. It is most
* useful for use with legacy data formats and legacy display technology
* (simple terminals). All operations are performed on Unicode characters.</p>
*
* <p>Text-based shaping means that some character code points in the text are
* replaced by others depending on the context. It transforms one kind of text
* into another. In comparison, modern displays for Arabic text select
* appropriate, context-dependent font glyphs for each text element, which means
* that they transform text into a glyph vector.</p>
*
* <p>Text transformations are necessary when modern display technology is not
* available or when text needs to be transformed to or from legacy formats that
* use "shaped" characters. Since the Arabic script is cursive, connecting
* adjacent letters to each other, computers select images for each letter based
* on the surrounding letters. This usually results in four images per Arabic
* letter: initial, middle, final, and isolated forms. In Unicode, on the other
* hand, letters are normally stored abstract, and a display system is expected
* to select the necessary glyphs. (This makes searching and other text
* processing easier because the same letter has only one code.) It is possible
* to mimic this with text transformations because there are characters in
* Unicode that are rendered as letters with a specific shape
* (or cursive connectivity). They were included for interoperability with
* legacy systems and codepages, and for unsophisticated display systems.</p>
*
* <p>A second kind of text transformations is supported for Arabic digits:
* For compatibility with legacy codepages that only include European digits,
* it is possible to replace one set of digits by another, changing the
* character code points. These operations can be performed for either
* Arabic-Indic Digits (U+0660...U+0669) or Eastern (Extended) Arabic-Indic
* digits (U+06f0...U+06f9).</p>
*
* <p>Some replacements may result in more or fewer characters (code points).
* By default, this means that the destination buffer may receive text with a
* length different from the source length. Some legacy systems rely on the
* length of the text to be constant. They expect extra spaces to be added
* or consumed either next to the affected character or at the end of the
* text.</p>
*
* <p>For details about the available operations, see the description of the
* <code>U_SHAPE_...</code> options.</p>
*
* @param source The input text.
*
* @param sourceLength The number of UChars in <code>source</code>.
*
* @param dest The destination buffer that will receive the results of the
* requested operations. It may be <code>NULL</code> only if
* <code>destSize</code> is 0. The source and destination must not
* overlap.
*
* @param destSize The size (capacity) of the destination buffer in UChars.
* If <code>destSize</code> is 0, then no output is produced,
* but the necessary buffer size is returned ("preflighting").
*
* @param options This is a 32-bit set of flags that specify the operations
* that are performed on the input text. If no error occurs,
* then the result will always be written to the destination
* buffer.
*
* @param pErrorCode must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
*
* @return The number of UChars written to the destination buffer.
* If an error occurred, then no output was written, or it may be
* incomplete. If <code>U_BUFFER_OVERFLOW_ERROR</code> is set, then
* the return value indicates the necessary destination buffer size.
* @stable ICU 2.0
*/
U_CAPI int32_t U_EXPORT2
u_shapeArabic(const UChar *source, int32_t sourceLength, /* input text and its length in UChars */
UChar *dest, int32_t destSize, /* output buffer and its capacity in UChars; destSize==0 preflights */
uint32_t options, /* 32-bit set of U_SHAPE_... flags */
UErrorCode *pErrorCode); /* in/out error code; must not indicate a failure on entry */
/**
* Memory option: allow the result to have a different length than the source.
* Affects: LamAlef options
* @stable ICU 2.0
*/
#define U_SHAPE_LENGTH_GROW_SHRINK 0
/**
* Memory option: allow the result to have a different length than the source.
* Affects: LamAlef options
* This option is an alias to U_SHAPE_LENGTH_GROW_SHRINK
* @stable ICU 4.2
*/
#define U_SHAPE_LAMALEF_RESIZE 0
/**
* Memory option: the result must have the same length as the source.
* If more room is necessary, then try to consume spaces next to modified characters.
* @stable ICU 2.0
*/
#define U_SHAPE_LENGTH_FIXED_SPACES_NEAR 1
/**
* Memory option: the result must have the same length as the source.
* If more room is necessary, then try to consume spaces next to modified characters.
* Affects: LamAlef options
* This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_NEAR
* @stable ICU 4.2
*/
#define U_SHAPE_LAMALEF_NEAR 1
/**
* Memory option: the result must have the same length as the source.
* If more room is necessary, then try to consume spaces at the end of the text.
* @stable ICU 2.0
*/
#define U_SHAPE_LENGTH_FIXED_SPACES_AT_END 2
/**
* Memory option: the result must have the same length as the source.
* If more room is necessary, then try to consume spaces at the end of the text.
* Affects: LamAlef options
* This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_END
* @stable ICU 4.2
*/
#define U_SHAPE_LAMALEF_END 2
/**
* Memory option: the result must have the same length as the source.
* If more room is necessary, then try to consume spaces at the beginning of the text.
* @stable ICU 2.0
*/
#define U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING 3
/**
* Memory option: the result must have the same length as the source.
* If more room is necessary, then try to consume spaces at the beginning of the text.
* Affects: LamAlef options
* This option is an alias to U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING
* @stable ICU 4.2
*/
#define U_SHAPE_LAMALEF_BEGIN 3
/**
* Memory option: the result must have the same length as the source.
* Shaping Mode: For each LAMALEF character found, expand LAMALEF using space at end.
* If there is no space at end, use spaces at beginning of the buffer. If there
* is no space at beginning of the buffer, use spaces at the near (i.e. the space
* after the LAMALEF character).
* If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
* will be set in pErrorCode
*
* Deshaping Mode: Performs the same function as U_SHAPE_LAMALEF_END.
* Affects: LamAlef options
* @stable ICU 4.2
*/
#define U_SHAPE_LAMALEF_AUTO 0x10000
/**
* Bit mask for memory options: the two low bits (option values 0..3)
* plus the U_SHAPE_LAMALEF_AUTO bit (0x10000).
* @stable ICU 2.0
*/
#define U_SHAPE_LENGTH_MASK 0x10003 /* the old value was 3; the 0x10000 bit is now included */
/**
* Bit mask for LamAlef memory options.
* Has the same value as U_SHAPE_LENGTH_MASK.
* @stable ICU 4.2
*/
#define U_SHAPE_LAMALEF_MASK 0x10003 /* same bits as U_SHAPE_LENGTH_MASK */
/** Direction indicator: the source is in logical (keyboard) order. @stable ICU 2.0 */
#define U_SHAPE_TEXT_DIRECTION_LOGICAL 0
/**
* Direction indicator:
* the source is in visual RTL order,
* the rightmost displayed character stored first.
* This option is an alias to U_SHAPE_TEXT_DIRECTION_LOGICAL
* @stable ICU 4.2
*/
#define U_SHAPE_TEXT_DIRECTION_VISUAL_RTL 0
/**
* Direction indicator:
* the source is in visual LTR order,
* the leftmost displayed character stored first.
* @stable ICU 2.0
*/
#define U_SHAPE_TEXT_DIRECTION_VISUAL_LTR 4
/** Bit mask for direction indicators. @stable ICU 2.0 */
#define U_SHAPE_TEXT_DIRECTION_MASK 4
/** Letter shaping option: do not perform letter shaping. @stable ICU 2.0 */
#define U_SHAPE_LETTERS_NOOP 0
/** Letter shaping option: replace abstract letter characters by "shaped" ones. @stable ICU 2.0 */
#define U_SHAPE_LETTERS_SHAPE 8
/** Letter shaping option: replace "shaped" letter characters by abstract ones. @stable ICU 2.0 */
#define U_SHAPE_LETTERS_UNSHAPE 0x10
/**
* Letter shaping option: replace abstract letter characters by "shaped" ones.
* The only difference with U_SHAPE_LETTERS_SHAPE is that Tashkeel letters
* are always "shaped" into the isolated form instead of the medial form
* (selecting code points from the Arabic Presentation Forms-B block).
* @stable ICU 2.0
*/
#define U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED 0x18
/** Bit mask for letter shaping options. @stable ICU 2.0 */
#define U_SHAPE_LETTERS_MASK 0x18
/** Digit shaping option: do not perform digit shaping. @stable ICU 2.0 */
#define U_SHAPE_DIGITS_NOOP 0
/**
* Digit shaping option:
* Replace European digits (U+0030...) by Arabic-Indic digits.
* @stable ICU 2.0
*/
#define U_SHAPE_DIGITS_EN2AN 0x20
/**
* Digit shaping option:
* Replace Arabic-Indic digits by European digits (U+0030...).
* @stable ICU 2.0
*/
#define U_SHAPE_DIGITS_AN2EN 0x40
/**
* Digit shaping option:
* Replace European digits (U+0030...) by Arabic-Indic digits if the most recent
* strongly directional character is an Arabic letter
* (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
* The direction of "preceding" depends on the direction indicator option.
* For the first characters, the preceding strongly directional character
* (initial state) is assumed to be not an Arabic letter
* (it is <code>U_LEFT_TO_RIGHT</code> [L] or <code>U_RIGHT_TO_LEFT</code> [R]).
* @stable ICU 2.0
*/
#define U_SHAPE_DIGITS_ALEN2AN_INIT_LR 0x60
/**
* Digit shaping option:
* Replace European digits (U+0030...) by Arabic-Indic digits if the most recent
* strongly directional character is an Arabic letter
* (<code>u_charDirection()</code> result <code>U_RIGHT_TO_LEFT_ARABIC</code> [AL]).<br>
* The direction of "preceding" depends on the direction indicator option.
* For the first characters, the preceding strongly directional character
* (initial state) is assumed to be an Arabic letter.
* @stable ICU 2.0
*/
#define U_SHAPE_DIGITS_ALEN2AN_INIT_AL 0x80
/** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
#define U_SHAPE_DIGITS_RESERVED 0xa0
/** Bit mask for digit shaping options. @stable ICU 2.0 */
#define U_SHAPE_DIGITS_MASK 0xe0
/** Digit type option: Use Arabic-Indic digits (U+0660...U+0669). @stable ICU 2.0 */
#define U_SHAPE_DIGIT_TYPE_AN 0
/** Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). @stable ICU 2.0 */
#define U_SHAPE_DIGIT_TYPE_AN_EXTENDED 0x100
/** Not a valid option value. May be replaced by a new option. @stable ICU 2.0 */
#define U_SHAPE_DIGIT_TYPE_RESERVED 0x200
/**
* Bit mask for digit type options.
* Covers U_SHAPE_DIGIT_TYPE_AN_EXTENDED (0x100) and U_SHAPE_DIGIT_TYPE_RESERVED (0x200).
* @stable ICU 2.0
*/
#define U_SHAPE_DIGIT_TYPE_MASK 0x300 /* original note: changed from 0x3f00 to 0x300 */
/**
* Tashkeel aggregation option:
* Replaces any combination of U+0651 with one of
* U+064C, U+064D, U+064E, U+064F, U+0650 with
* U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 consecutively.
* @stable ICU 3.6
*/
#define U_SHAPE_AGGREGATE_TASHKEEL 0x4000
/** Tashkeel aggregation option: do not aggregate tashkeels. @stable ICU 3.6 */
#define U_SHAPE_AGGREGATE_TASHKEEL_NOOP 0
/** Bit mask for tashkeel aggregation. @stable ICU 3.6 */
#define U_SHAPE_AGGREGATE_TASHKEEL_MASK 0x4000
/**
* Presentation form option:
* Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B
* characters with U+06xx characters before shaping.
* @stable ICU 3.6
*/
#define U_SHAPE_PRESERVE_PRESENTATION 0x8000
/** Presentation form option:
* Replace Arabic Presentation Forms-A and Arabic Presentation Forms-B with
* their unshaped correspondents in the range U+06xx before shaping.
* @stable ICU 3.6
*/
#define U_SHAPE_PRESERVE_PRESENTATION_NOOP 0
/** Bit mask for preserve presentation form. @stable ICU 3.6 */
#define U_SHAPE_PRESERVE_PRESENTATION_MASK 0x8000
/* Seen Tail option */
/**
* Memory option: the result must have the same length as the source.
* Shaping mode: The SEEN family character will expand into two characters using space near
* the SEEN family character(i.e. the space after the character).
* If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
* will be set in pErrorCode
*
* De-shaping mode: Any Seen character followed by Tail character will be
* replaced by one cell Seen and a space will replace the Tail.
* Affects: Seen options
* @stable ICU 4.2
*/
#define U_SHAPE_SEEN_TWOCELL_NEAR 0x200000
/**
* Bit mask for Seen memory options.
* @stable ICU 4.2
*/
#define U_SHAPE_SEEN_MASK 0x700000
/* YehHamza option */
/**
* Memory option: the result must have the same length as the source.
* Shaping mode: The YEHHAMZA character will expand into two characters using space near it
* (i.e. the space after the character).
* If there are no spaces found, an error U_NO_SPACE_AVAILABLE (as defined in utypes.h)
* will be set in pErrorCode
*
* De-shaping mode: Any Yeh (final or isolated) character followed by Hamza character will be
* replaced by one cell YehHamza and space will replace the Hamza.
* Affects: YehHamza options
* @stable ICU 4.2
*/
#define U_SHAPE_YEHHAMZA_TWOCELL_NEAR 0x1000000
/**
* Bit mask for YehHamza memory options.
* @stable ICU 4.2
*/
#define U_SHAPE_YEHHAMZA_MASK 0x3800000
/* New Tashkeel options */
/**
* Memory option: the result must have the same length as the source.
* Shaping mode: Tashkeel characters will be replaced by spaces.
* Spaces will be placed at beginning of the buffer
*
* De-shaping mode: N/A
* Affects: Tashkeel options
* @stable ICU 4.2
*/
#define U_SHAPE_TASHKEEL_BEGIN 0x40000
/**
* Memory option: the result must have the same length as the source.
* Shaping mode: Tashkeel characters will be replaced by spaces.
* Spaces will be placed at end of the buffer
*
* De-shaping mode: N/A
* Affects: Tashkeel options
* @stable ICU 4.2
*/
#define U_SHAPE_TASHKEEL_END 0x60000
/**
* Memory option: allow the result to have a different length than the source.
* Shaping mode: Tashkeel characters will be removed, buffer length will shrink.
* De-shaping mode: N/A
*
* Affects: Tashkeel options
* @stable ICU 4.2
*/
#define U_SHAPE_TASHKEEL_RESIZE 0x80000
/**
* Memory option: the result must have the same length as the source.
* Shaping mode: Tashkeel characters will be replaced by Tatweel if it is connected to adjacent
* characters (i.e. shaped on Tatweel) or replaced by space if it is not connected.
*
* De-shaping mode: N/A
* Affects: Tashkeel options
* @stable ICU 4.2
*/
#define U_SHAPE_TASHKEEL_REPLACE_BY_TATWEEL 0xC0000
/**
* Bit mask for Tashkeel replacement with Space or Tatweel memory options.
* @stable ICU 4.2
*/
#define U_SHAPE_TASHKEEL_MASK 0xE0000
/* Space location Control options */
/**
* This option affects the meaning of the BEGIN and END options. If this option is not used, the default
* for BEGIN and END is as follows:
* The Default (for Visual LTR, Visual RTL and Logical Text alike)
* 1. BEGIN always refers to the start address of physical memory.
* 2. END always refers to the end address of physical memory.
*
* If this option is used, it swaps the meaning of BEGIN and END only for Visual LTR text.
*
* The effect on the BEGIN and END Memory Options is as follows:
* A. BEGIN For Visual LTR text: This will be the beginning (right side) of the visual text
* (corresponding to the physical memory address end for Visual LTR text; same as END in
* default behavior).
* B. BEGIN For Logical text: Same as BEGIN in default behavior.
* C. END For Visual LTR text: This will be the end (left side) of the visual text (corresponding
* to the physical memory address beginning for Visual LTR text; same as BEGIN in default behavior).
* D. END For Logical text: Same as END in default behavior.
* Affects: All LamAlef BEGIN, END and AUTO options.
* @stable ICU 4.2
*/
#define U_SHAPE_SPACES_RELATIVE_TO_TEXT_BEGIN_END 0x4000000
/**
* Bit mask for swapping BEGIN and END for Visual LTR text
* @stable ICU 4.2
*/
#define U_SHAPE_SPACES_RELATIVE_TO_TEXT_MASK 0x4000000
/**
* If this option is used, shaping will use the new Unicode code point for TAIL (i.e. 0xFE73).
* If this option is not specified (Default), old unofficial Unicode TAIL code point is used (i.e. 0x200B)
* De-shaping will not use this option as it will always search for both the new Unicode code point for the
* TAIL (i.e. 0xFE73) or the old unofficial Unicode TAIL code point (i.e. 0x200B) and de-shape the
* Seen-Family letter accordingly.
*
* Shaping Mode: Only shaping.
* De-shaping Mode: N/A.
* Affects: All Seen options
* @stable ICU 4.8
*/
#define U_SHAPE_TAIL_NEW_UNICODE 0x8000000
/**
* Bit mask for new Unicode Tail option
* @stable ICU 4.8
*/
#define U_SHAPE_TAIL_TYPE_MASK 0x8000000
#endif

274
thirdparty/icu4c/common/unicode/usprep.h vendored Normal file
View File

@@ -0,0 +1,274 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2003-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: usprep.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003jul2
* created by: Ram Viswanadha
*/
#ifndef __USPREP_H__
#define __USPREP_H__
/**
* \file
* \brief C API: Implements the StringPrep algorithm.
*/
#include "unicode/utypes.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#endif // U_SHOW_CPLUSPLUS_API
/**
*
* StringPrep API implements the StringPrep framework as described by RFC 3454.
* StringPrep prepares Unicode strings for use in network protocols.
* Profiles of StringPrep are sets of rules and data according to which the
* Unicode Strings are prepared. Each profile contains tables which describe
* how a code point should be treated. The tables are broadly classified into
* <ul>
* <li> Unassigned Table: Contains code points that are unassigned
* in the Unicode Version supported by StringPrep. Currently
* RFC 3454 supports Unicode 3.2. </li>
* <li> Prohibited Table: Contains code points that are prohibited from
* the output of the StringPrep processing function. </li>
* <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
* </ul>
*
* The procedure for preparing Unicode strings:
* <ol>
* <li> Map: For each character in the input, check if it has a mapping
* and, if so, replace it with its mapping. </li>
* <li> Normalize: Possibly normalize the result of step 1 using Unicode
* normalization. </li>
* <li> Prohibit: Check for any characters that are not allowed in the
* output. If any are found, return an error.</li>
* <li> Check bidi: Possibly check for right-to-left characters, and if
* any are found, make sure that the whole string satisfies the
* requirements for bidirectional strings. If the string does not
* satisfy the requirements for bidirectional strings, return an
* error. </li>
* </ol>
* @author Ram Viswanadha
*/
#if !UCONFIG_NO_IDNA
#include "unicode/parseerr.h"
/**
* The StringPrep profile
* @stable ICU 2.8
*/
typedef struct UStringPrepProfile UStringPrepProfile;
/**
* Option to prohibit processing of unassigned code points in the input
*
* @see usprep_prepare
* @stable ICU 2.8
*/
#define USPREP_DEFAULT 0x0000
/**
* Option to allow processing of unassigned code points in the input
*
* @see usprep_prepare
* @stable ICU 2.8
*/
#define USPREP_ALLOW_UNASSIGNED 0x0001
/**
 * Standard StringPrep profile types accepted by usprep_openByType.
 * The enumerators are numbered consecutively from 0 in declaration order;
 * the values are written out so that the numbering is visible at a glance.
 * @see usprep_openByType
 * @stable ICU 4.2
 */
typedef enum UStringPrepProfileType {
    /** RFC3491 Nameprep. @stable ICU 4.2 */
    USPREP_RFC3491_NAMEPREP = 0,
    /** RFC3530 nfs4_cs_prep. @stable ICU 4.2 */
    USPREP_RFC3530_NFS4_CS_PREP = 1,
    /** RFC3530 nfs4_cs_prep with case insensitive option. @stable ICU 4.2 */
    USPREP_RFC3530_NFS4_CS_PREP_CI = 2,
    /** RFC3530 nfs4_cis_prep. @stable ICU 4.2 */
    USPREP_RFC3530_NFS4_CIS_PREP = 3,
    /** RFC3530 nfs4_mixed_prep for prefix. @stable ICU 4.2 */
    USPREP_RFC3530_NFS4_MIXED_PREP_PREFIX = 4,
    /** RFC3530 nfs4_mixed_prep for suffix. @stable ICU 4.2 */
    USPREP_RFC3530_NFS4_MIXED_PREP_SUFFIX = 5,
    /** RFC3722 iSCSI. @stable ICU 4.2 */
    USPREP_RFC3722_ISCSI = 6,
    /** RFC3920 XMPP Nodeprep. @stable ICU 4.2 */
    USPREP_RFC3920_NODEPREP = 7,
    /** RFC3920 XMPP Resourceprep. @stable ICU 4.2 */
    USPREP_RFC3920_RESOURCEPREP = 8,
    /** RFC4011 Policy MIB Stringprep. @stable ICU 4.2 */
    USPREP_RFC4011_MIB = 9,
    /** RFC4013 SASLprep. @stable ICU 4.2 */
    USPREP_RFC4013_SASLPREP = 10,
    /** RFC4505 trace. @stable ICU 4.2 */
    USPREP_RFC4505_TRACE = 11,
    /** RFC4518 LDAP. @stable ICU 4.2 */
    USPREP_RFC4518_LDAP = 12,
    /**
     * RFC4518 LDAP for case ignore, numeric and stored prefix
     * matching rules.
     * @stable ICU 4.2
     */
    USPREP_RFC4518_LDAP_CI = 13
} UStringPrepProfileType;
/**
* Creates a StringPrep profile from the data file.
*
* @param path string containing the full path pointing to the directory
* where the profile resides, followed by the package name,
* e.g. "/usr/resource/my_app/profiles/mydata" on a Unix system.
* If NULL, ICU default data files will be used.
* @param fileName name of the profile file to be opened
* @param status ICU error code in/out parameter. Must not be NULL.
* Must fulfill U_SUCCESS before the function call.
* @return Pointer to the UStringPrepProfile that is opened. Should be closed by
* calling usprep_close()
* @see usprep_close()
* @stable ICU 2.8
*/
U_CAPI UStringPrepProfile* U_EXPORT2
usprep_open(const char* path,
const char* fileName,
UErrorCode* status);
/**
* Creates a StringPrep profile for the specified profile type.
*
* @param type The profile type, one of the UStringPrepProfileType enumerators
* @param status ICU error code in/out parameter. Must not be NULL.
* Must fulfill U_SUCCESS before the function call.
* @return Pointer to the UStringPrepProfile that is opened. Should be closed by
* calling usprep_close()
* @see UStringPrepProfileType
* @see usprep_close()
* @stable ICU 4.2
*/
U_CAPI UStringPrepProfile* U_EXPORT2
usprep_openByType(UStringPrepProfileType type,
UErrorCode* status);
/**
* Closes the profile.
* This is the function that profiles returned by usprep_open() and
* usprep_openByType() should be closed with.
* @param profile The profile to close
* @see usprep_open()
* @see usprep_openByType()
* @stable ICU 2.8
*/
U_CAPI void U_EXPORT2
usprep_close(UStringPrepProfile* profile);
/* C++-only section: compiled only when U_SHOW_CPLUSPLUS_API is set. */
#if U_SHOW_CPLUSPLUS_API
U_NAMESPACE_BEGIN
/**
* \class LocalUStringPrepProfilePointer
* "Smart pointer" class, closes a UStringPrepProfile via usprep_close().
* For most methods see the LocalPointerBase base class.
*
* @see LocalPointerBase
* @see LocalPointer
* @stable ICU 4.4
*/
U_DEFINE_LOCAL_OPEN_POINTER(LocalUStringPrepProfilePointer, UStringPrepProfile, usprep_close);
U_NAMESPACE_END
#endif // U_SHOW_CPLUSPLUS_API
/**
* Prepares the input buffer for use in applications with the given profile. This operation maps, normalizes (NFKC),
* and checks for prohibited and BiDi characters in the order defined by RFC 3454,
* depending on the options specified in the profile.
*
* @param prep The profile to use
* @param src Pointer to UChar buffer containing the string to prepare
* @param srcLength Number of characters in the source string
* @param dest Pointer to the destination buffer to receive the output
* @param destCapacity The capacity of the destination array
* @param options A bit set of options:
*
* - USPREP_DEFAULT Prohibit processing of unassigned code points in the input
*
* - USPREP_ALLOW_UNASSIGNED Treat unassigned code points in the input
* as normal Unicode code points.
*
* @param parseError Pointer to UParseError struct to receive information on the position
* of the error if an error is encountered. Can be NULL.
* @param status ICU in/out error code parameter.
* U_INVALID_CHAR_FOUND if src contains
* unmatched single surrogates.
* U_INDEX_OUTOFBOUNDS_ERROR if src contains
* too many code points.
* U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
* @return The number of UChars in the destination buffer
* @stable ICU 2.8
*/
U_CAPI int32_t U_EXPORT2
usprep_prepare( const UStringPrepProfile* prep,
const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
int32_t options,
UParseError* parseError,
UErrorCode* status );
#endif /* #if !UCONFIG_NO_IDNA */
#endif

1685
thirdparty/icu4c/common/unicode/ustring.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,97 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ustringtrie.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec17
* created by: Markus W. Scherer
*/
#ifndef __USTRINGTRIE_H__
#define __USTRINGTRIE_H__
/**
* \file
* \brief C API: Helper definitions for dictionary trie APIs.
*/
#include "unicode/utypes.h"
/**
 * Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
 * The numeric values matter: USTRINGTRIE_HAS_VALUE() and USTRINGTRIE_HAS_NEXT()
 * test them arithmetically, so they are written out explicitly here.
 * @see USTRINGTRIE_MATCHES
 * @see USTRINGTRIE_HAS_VALUE
 * @see USTRINGTRIE_HAS_NEXT
 * @stable ICU 4.8
 */
enum UStringTrieResult {
    /**
     * The input unit(s) did not continue a matching string.
     * Once current()/next() return USTRINGTRIE_NO_MATCH,
     * all further calls to current()/next() will also return USTRINGTRIE_NO_MATCH,
     * until the trie is reset to its original state or to a saved state.
     * @stable ICU 4.8
     */
    USTRINGTRIE_NO_MATCH = 0,
    /**
     * The input unit(s) continued a matching string
     * but there is no value for the string so far.
     * (It is a prefix of a longer string.)
     * @stable ICU 4.8
     */
    USTRINGTRIE_NO_VALUE = 1,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * No further input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    USTRINGTRIE_FINAL_VALUE = 2,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * Another input byte/unit can continue a matching string.
     * @stable ICU 4.8
     */
    USTRINGTRIE_INTERMEDIATE_VALUE = 3
};
/**
* Same as (result!=USTRINGTRIE_NO_MATCH).
* True for USTRINGTRIE_NO_VALUE, USTRINGTRIE_FINAL_VALUE and
* USTRINGTRIE_INTERMEDIATE_VALUE. The argument is evaluated once.
* @param result A result from BytesTrie::first(), UCharsTrie::next() etc.
* @return true if the input bytes/units so far are part of a matching string/byte sequence.
* @stable ICU 4.8
*/
#define USTRINGTRIE_MATCHES(result) ((result)!=USTRINGTRIE_NO_MATCH)
/**
* Equivalent to (result==USTRINGTRIE_INTERMEDIATE_VALUE || result==USTRINGTRIE_FINAL_VALUE) but
* this macro evaluates result exactly once.
* The single comparison relies on the declaration order of UStringTrieResult:
* USTRINGTRIE_FINAL_VALUE and USTRINGTRIE_INTERMEDIATE_VALUE are the two
* highest enumerators, so the order must not be changed.
* @param result A result from BytesTrie::first(), UCharsTrie::next() etc.
* @return true if there is a value for the input bytes/units so far.
* @see BytesTrie::getValue
* @see UCharsTrie::getValue
* @stable ICU 4.8
*/
#define USTRINGTRIE_HAS_VALUE(result) ((result)>=USTRINGTRIE_FINAL_VALUE)
/**
* Equivalent to (result==USTRINGTRIE_NO_VALUE || result==USTRINGTRIE_INTERMEDIATE_VALUE) but
* this macro evaluates result exactly once.
* The bit test relies on the enumerator values of UStringTrieResult:
* USTRINGTRIE_NO_VALUE (1) and USTRINGTRIE_INTERMEDIATE_VALUE (3) are the
* odd ones, USTRINGTRIE_NO_MATCH (0) and USTRINGTRIE_FINAL_VALUE (2) are even.
* @param result A result from BytesTrie::first(), UCharsTrie::next() etc.
* @return true if another input byte/unit can continue a matching string.
* @stable ICU 4.8
*/
#define USTRINGTRIE_HAS_NEXT(result) ((result)&1)
#endif /* __USTRINGTRIE_H__ */

1603
thirdparty/icu4c/common/unicode/utext.h vendored Normal file

File diff suppressed because it is too large Load Diff

225
thirdparty/icu4c/common/unicode/utf.h vendored Normal file
View File

@@ -0,0 +1,225 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep09
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: Code point macros
*
* This file defines macros for checking whether a code point is
* a surrogate or a non-character etc.
*
* If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 0 then utf.h is included by utypes.h
* and itself includes utf8.h and utf16.h after some
* common definitions.
* If U_NO_DEFAULT_INCLUDE_UTF_HEADERS is 1 then each of these headers must be
* included explicitly if their definitions are used.
*
* utf8.h and utf16.h define macros for efficiently getting code points
* in and out of UTF-8/16 strings.
* utf16.h macros have "U16_" prefixes.
* utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling.
*
* ICU mostly processes 16-bit Unicode strings.
* Most of the time, such strings are well-formed UTF-16.
* Single, unpaired surrogates must be handled as well, and are treated in ICU
* like regular code points where possible.
* (Pairs of surrogate code points are indistinguishable from supplementary
* code points encoded as pairs of surrogate code units.)
*
* In fact, almost all Unicode code points in normal text (>99%)
* are on the BMP (<=U+ffff) and even <=U+d7ff.
* ICU functions handle supplementary code points (U+10000..U+10ffff)
* but are optimized for the much more frequently occurring BMP code points.
*
* umachine.h defines UChar to be an unsigned 16-bit integer.
* Since ICU 59, ICU uses char16_t in C++, UChar only in C,
* and defines UChar=char16_t by default. See the UChar API docs for details.
*
* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
* Unicode code point (Unicode scalar value, 0..0x10ffff) and U_SENTINEL (-1).
* Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
* the definition of UChar. For details see the documentation for UChar32 itself.
*
* utf.h defines a small number of C macros for single Unicode code points.
* These are simple checks for surrogates and non-characters.
* For actual Unicode character properties see uchar.h.
*
* By default, string operations must be done with error checking in case
* a string is not well-formed UTF-16 or UTF-8.
*
* The U16_ macros detect if a surrogate code unit is unpaired
* (lead unit without trail unit or vice versa) and just return the unit itself
* as the code point.
*
* The U8_ macros detect illegal byte sequences and return a negative value.
* Starting with ICU 60, the observable length of a single illegal byte sequence
* skipped by one of these macros follows the Unicode 6+ recommendation
* which is consistent with the W3C Encoding Standard.
*
* There are ..._OR_FFFD versions of both U16_ and U8_ macros
* that return U+FFFD for illegal code unit sequences.
*
* The regular "safe" macros require that the initial, passed-in string index
* is within bounds. They only check the index when they read more than one
* code unit. This is usually done with code similar to the following loop:
* <pre>while(i<length) {
* U16_NEXT(s, i, length, c);
* // use c
* }</pre>
*
* When it is safe to assume that text is well-formed UTF-16
* (does not contain single, unpaired surrogates), then one can use
* U16_..._UNSAFE macros.
* These do not check for proper code unit sequences or truncated text and may
* yield wrong results or even cause a crash if they are used with "malformed"
* text.
* In practice, U16_..._UNSAFE macros will produce slightly less code but
* should not be faster because the processing is only different when a
* surrogate code unit is detected, which will be rare.
*
* Similarly for UTF-8, there are "safe" macros without a suffix,
* and U8_..._UNSAFE versions.
* The performance differences are much larger here because UTF-8 provides so
* many opportunities for malformed sequences.
* The unsafe UTF-8 macros are entirely implemented inside the macro definitions
* and are fast, while the safe UTF-8 macros call functions for some complicated cases.
*
* Unlike with UTF-16, malformed sequences cannot be expressed with distinct
* code point values (0..U+10ffff). They are indicated with negative values instead.
*
* For more information see the ICU User Guide Strings chapter
* (https://unicode-org.github.io/icu/userguide/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*
* @stable ICU 2.4
*/
#ifndef __UTF_H__
#define __UTF_H__
#include "unicode/umachine.h"
/* include the utfXX.h after the following definitions */
/* single-code point definitions -------------------------------------------- */
/**
* Tests whether a code point is a Unicode noncharacter.
*
* Noncharacters are U+fdd0..U+fdef plus, on every plane up to U+10ffff,
* the two code points whose low 16 bits are fffe or ffff.
* The argument is evaluated more than once.
*
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.4
*/
#define U_IS_UNICODE_NONCHAR(c) \
    (!((c)<0xfdd0 || 0x10ffff<(c)) && \
     (((c)&0xfffe)==0xfffe || (c)<=0xfdef))
/**
* Tests whether c is a Unicode code point value (0..U+10ffff)
* that can be assigned a character.
*
* Values that are rejected:
* - single surrogate code points (U+d800..U+dfff, 2048 code points)
* - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
* - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
* - anything above U+10ffff, the highest Unicode code point value
*
* Every code point below U+d800 is a character code point, so that
* boundary is checked first for performance.
* The argument is evaluated more than once.
*
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.4
*/
#define U_IS_UNICODE_CHAR(c) \
    ((uint32_t)(c)<0xd800 || \
     ((c)>0xdfff && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
/**
* Tests whether a code point lies in the Basic Multilingual Plane
* (U+0000..U+ffff). The unsigned cast makes negative values fail the test.
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.8
*/
#define U_IS_BMP(c) ((uint32_t)(c)<0x10000)
/**
* Tests whether a code point is supplementary (U+10000..U+10ffff).
* Subtracting 0x10000 and comparing as unsigned checks both ends of the
* range with a single comparison.
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.8
*/
#define U_IS_SUPPLEMENTARY(c) ((uint32_t)((c)-0x10000)<0x100000)
/**
* Tests whether a code point is a lead surrogate (U+d800..U+dbff).
* All bits above the low 10 must match 0xd800.
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.4
*/
#define U_IS_LEAD(c) (0xd800==((c)&0xfffffc00))
/**
* Tests whether a code point is a trail surrogate (U+dc00..U+dfff).
* All bits above the low 10 must match 0xdc00.
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.4
*/
#define U_IS_TRAIL(c) (0xdc00==((c)&0xfffffc00))
/**
* Tests whether a code point is a surrogate of either kind (U+d800..U+dfff).
* All bits above the low 11 must match 0xd800.
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.4
*/
#define U_IS_SURROGATE(c) (0xd800==((c)&0xfffff800))
/**
* Given that c is already known to be a surrogate code point
* (U_IS_SURROGATE(c)), tests whether it is a lead surrogate.
* Only bit 10 is inspected, so the result is meaningless for non-surrogates.
* @param c 32-bit code point
* @return true or false
* @stable ICU 2.4
*/
#define U_IS_SURROGATE_LEAD(c) (!((c)&0x400))
/**
* Given that c is already known to be a surrogate code point
* (U_IS_SURROGATE(c)), tests whether it is a trail surrogate.
* Only bit 10 is inspected, so the result is meaningless for non-surrogates.
* @param c 32-bit code point
* @return true or false
* @stable ICU 4.2
*/
#define U_IS_SURROGATE_TRAIL(c) (0!=((c)&0x400))
/* include the utfXX.h ------------------------------------------------------ */
#if !U_NO_DEFAULT_INCLUDE_UTF_HEADERS
#include "unicode/utf8.h"
#include "unicode/utf16.h"
/* utf_old.h contains deprecated, pre-ICU 2.4 definitions */
#include "unicode/utf_old.h"
#endif /* !U_NO_DEFAULT_INCLUDE_UTF_HEADERS */
#endif /* __UTF_H__ */

734
thirdparty/icu4c/common/unicode/utf16.h vendored Normal file
View File

@@ -0,0 +1,734 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf16.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep09
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: 16-bit Unicode handling macros
*
* This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (https://unicode-org.github.io/icu/userguide/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
#ifndef __UTF16_H__
#define __UTF16_H__
#include <stdbool.h>
#include "unicode/umachine.h"
#ifndef __UTF_H__
# include "unicode/utf.h"
#endif
/* single-code point definitions -------------------------------------------- */
/**
* Does this code unit alone encode a code point (BMP, not a surrogate)?
*
* The replacement list is fully parenthesized so that the macro expands to
* a single primary expression, like the other predicates in this file.
*
* @param c 16-bit code unit
* @return true or false
* @stable ICU 2.4
*/
#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))
/**
* Tests whether a code unit is a lead surrogate (U+d800..U+dbff).
* All bits above the low 10 must match 0xd800.
* @param c 16-bit code unit
* @return true or false
* @stable ICU 2.4
*/
#define U16_IS_LEAD(c) (0xd800==((c)&0xfffffc00))
/**
* Tests whether a code unit is a trail surrogate (U+dc00..U+dfff).
* All bits above the low 10 must match 0xdc00.
* @param c 16-bit code unit
* @return true or false
* @stable ICU 2.4
*/
#define U16_IS_TRAIL(c) (0xdc00==((c)&0xfffffc00))
/**
* Is this code unit a surrogate (U+d800..U+dfff)?
* This is a plain alias for U_IS_SURROGATE, so both macros always agree.
* @param c 16-bit code unit
* @return true or false
* @stable ICU 2.4
*/
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
/**
* Given that c is already known to be a surrogate code unit
* (U16_IS_SURROGATE(c)), tests whether it is a lead surrogate.
* Only bit 10 is inspected, so the result is meaningless for non-surrogates.
* @param c 16-bit code unit
* @return true or false
* @stable ICU 2.4
*/
#define U16_IS_SURROGATE_LEAD(c) (!((c)&0x400))
/**
* Given that c is already known to be a surrogate code unit
* (U16_IS_SURROGATE(c)), tests whether it is a trail surrogate.
* Only bit 10 is inspected, so the result is meaningless for non-surrogates.
* @param c 16-bit code unit
* @return true or false
* @stable ICU 4.2
*/
#define U16_IS_SURROGATE_TRAIL(c) (0!=((c)&0x400))
/**
* Helper constant for U16_GET_SUPPLEMENTARY.
* It is the combined bias of the first lead surrogate (0xd800, shifted
* left by 10) and the first trail surrogate (0xdc00), reduced by 0x10000,
* the first supplementary code point.
* @internal
*/
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)-0x10000+0xdc00)
/**
* Combines a lead and a trail surrogate into the supplementary
* code point (U+10000..U+10ffff) that the pair encodes.
* No validation is done: the result is undefined if the input values
* are not a lead and a trail surrogate.
*
* @param lead lead surrogate (U+d800..U+dbff)
* @param trail trail surrogate (U+dc00..U+dfff)
* @return supplementary code point (U+10000..U+10ffff)
* @stable ICU 2.4
*/
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((UChar32)(trail)+((UChar32)(lead)<<10UL)-U16_SURROGATE_OFFSET)
/**
* Get the lead surrogate (0xd800..0xdbff) for a
* supplementary code point (0x10000..0x10ffff).
*
* The cast expression is wrapped in parentheses so that the macro expands
* to a single primary expression; without them a use such as
* `sizeof U16_LEAD(c)` would be parsed as `sizeof(UChar)` followed by a
* stray parenthesized expression.
*
* @param supplementary 32-bit code point (U+10000..U+10ffff)
* @return lead surrogate (U+d800..U+dbff) for supplementary
* @stable ICU 2.4
*/
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
/**
* Get the trail surrogate (0xdc00..0xdfff) for a
* supplementary code point (0x10000..0x10ffff).
*
* The cast expression is wrapped in parentheses so that the macro expands
* to a single primary expression; without them a use such as
* `sizeof U16_TRAIL(c)` would be parsed as `sizeof(UChar)` followed by a
* stray parenthesized expression.
*
* @param supplementary 32-bit code point (U+10000..U+10ffff)
* @return trail surrogate (U+dc00..U+dfff) for supplementary
* @stable ICU 2.4
*/
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
/**
* Number of 16-bit code units needed to encode a Unicode code point:
* 2 when the value, viewed as unsigned, is above U+ffff, otherwise 1.
* The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
* @param c 32-bit code point
* @return 1 or 2
* @stable ICU 2.4
*/
#define U16_LENGTH(c) ((uint32_t)(c)>0xffff ? 2 : 1)
/**
* The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
* This equals the larger of the two values U16_LENGTH can produce
* (a surrogate pair).
* @return 2
* @stable ICU 2.4
*/
#define U16_MAX_LENGTH 2
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well: s[i+1] after a lead surrogate,
* s[i-1] before a trail surrogate. Neither access is bounds-checked.
* The result is undefined if the offset points to a single, unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* The arguments s, i and c are each evaluated more than once, so they
* must not have side effects.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_GET
* @stable ICU 2.4
*/
#define U16_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
if(U16_IS_SURROGATE_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
} else { \
(c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
* A lead surrogate is only combined when i+1 is not equal to length and
* s[i+1] is a trail surrogate; a trail surrogate is only combined when
* i is greater than start and s[i-1] is a lead surrogate.
*
* The length can be negative for a NUL-terminated string: the test
* i+1!=length is then always true, and s[i+1] is read.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to that unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* The arguments are evaluated more than once, so they must not have
* side effects. The expansion declares a local named __c2.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 2.4
*/
#define U16_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c)) { \
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} else { \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
* A lead surrogate is only combined when i+1 is not equal to length and
* s[i+1] is a trail surrogate; a trail surrogate is only combined when
* i is greater than start and s[i-1] is a lead surrogate.
*
* The length can be negative for a NUL-terminated string: the test
* i+1!=length is then always true, and s[i+1] is read.
*
* If the offset points to a single, unpaired surrogate, then
* c is set to U+FFFD.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT_OR_FFFD.
*
* The arguments are evaluated more than once, so they must not have
* side effects. The expansion declares a local named __c2.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @stable ICU 60
*/
#define U16_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[i]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c)) { \
if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} else { \
(c)=0xfffd; \
} \
} else { \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} else { \
(c)=0xfffd; \
} \
} \
} \
} UPRV_BLOCK_MACRO_END
/* definitions with forward iteration --------------------------------------- */
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well and advance past it,
* without checking that the following unit really is a trail surrogate
* or that it lies inside the string.
* If the offset points to a trail surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset points to a single, unpaired lead surrogate.
*
* i must be a modifiable lvalue; s, i and c are evaluated more than once.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_NEXT
* @stable ICU 2.4
*/
#define U16_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string: the test
* i!=length is then always true, and the unit after a lead surrogate
* is read.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well. The offset is advanced a second
* time only when that following unit is a trail surrogate.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to that unpaired surrogate.
*
* i must be a modifiable lvalue; the arguments are evaluated more than
* once. The expansion declares a local named __c2.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @stable ICU 2.4
*/
#define U16_NEXT(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
uint16_t __c2; \
if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The length can be negative for a NUL-terminated string: the test
* i!=length is then always true, and the unit after a lead surrogate
* is read.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well. The offset is advanced a second
* time only when a complete pair was found.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then c is set to U+FFFD
* and the offset has advanced by exactly one code unit.
*
* i must be a modifiable lvalue; the arguments are evaluated more than
* once. The expansion declares a local named __c2.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @stable ICU 60
*/
#define U16_NEXT_OR_FFFD(s, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[(i)++]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_LEAD(c) && (i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} else { \
(c)=0xfffd; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* Values up to U+ffff are stored as one unit; anything larger is stored
* as a lead/trail surrogate pair computed with the same arithmetic as
* U16_LEAD and U16_TRAIL.
* i must be a modifiable lvalue; s, i and c are evaluated more than once.
*
* @param s UChar * string buffer (written to, so it must not be const)
* @param i string offset
* @param c code point to append
* @see U16_APPEND
* @stable ICU 2.4
*/
#define U16_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks for a valid code point.
* If a surrogate pair is written, checks for sufficient space in the string.
* If the code point is not valid or a trail surrogate does not fit,
* then isError is set to true.
*
* Details visible in the expansion:
* - A value up to U+ffff is stored as a single unit without any further
*   check; this includes surrogate code points, and the capacity is not
*   consulted (hence the precondition i<capacity).
* - A larger value is stored as a surrogate pair only when it is at most
*   U+10ffff and i+1<capacity.
* - On error nothing is written and i is not advanced.
* i must be a modifiable lvalue; the arguments are evaluated more than once.
*
* @param s UChar * string buffer (written to, so it must not be const)
* @param i string offset, must be i<capacity
* @param capacity size of the string buffer
* @param c code point to append
* @param isError output UBool set to true if an error occurs, otherwise not modified
* @see U16_APPEND_UNSAFE
* @stable ICU 2.4
*/
#define U16_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} else /* c>0x10ffff or not enough space */ { \
(isError)=true; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* Advances by one unit, and by a second unit when the unit just passed
* is a lead surrogate; the following unit is never inspected.
* i must be a modifiable lvalue.
*
* @param s const UChar * string
* @param i string offset
* @see U16_FWD_1
* @stable ICU 2.4
*/
#define U16_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)++])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* Advances by one unit, and by a second unit only when the unit just
* passed is a lead surrogate, the new offset is not equal to length, and
* the unit at the new offset is a trail surrogate.
*
* The length can be negative for a NUL-terminated string: the test
* i!=length is then always true.
* i must be a modifiable lvalue; s and i are evaluated more than once.
*
* @param s const UChar * string
* @param i string offset, must be i<length
* @param length string length
* @see U16_FWD_1_UNSAFE
* @stable ICU 2.4
*/
#define U16_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* n is evaluated once and copied into a local int32_t named __N;
* U16_FWD_1_UNSAFE is then applied while that counter is positive, so a
* zero or negative n leaves i unchanged. No string boundary is checked.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_FWD_N
* @stable ICU 2.4
*/
#define U16_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U16_FWD_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* n is evaluated once and copied into a local int32_t named __N.
* U16_FWD_1 is applied while that counter is positive and the end of the
* string has not been reached, so i may advance by fewer than n code
* points. A zero or negative n leaves i unchanged.
*
* The length can be negative for a NUL-terminated string; in that case
* the end of the string is detected by a zero code unit at s[i].
*
* @param s const UChar * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param n number of code points to skip
* @see U16_FWD_N_UNSAFE
* @stable ICU 2.4
*/
#define U16_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
U16_FWD_1(s, i, length); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* Only s[i] is inspected: any trail surrogate causes the decrement,
* without checking the preceding unit or the start of the string.
* i must be a modifiable lvalue.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_START
* @stable ICU 2.4
*/
#define U16_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[i])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The decrement happens only when s[i] is a trail surrogate, i is greater
* than start, and s[i-1] is a lead surrogate. s[i] itself is read
* unconditionally.
* i must be a modifiable lvalue; s and i are evaluated more than once.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<=i
* @see U16_SET_CP_START_UNSAFE
* @stable ICU 2.4
*/
#define U16_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/* definitions with backward iteration -------------------------------------- */
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well and move before it,
* without checking that the preceding unit really is a lead surrogate
* or that it lies inside the string.
* If the offset is behind a lead surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset is behind a single, unpaired trail surrogate.
*
* i must be a modifiable lvalue; s, i and c are evaluated more than once.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_PREV
* @stable ICU 2.4
*/
#define U16_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_TRAIL(c)) { \
(c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well. The offset is decremented a second
* time only when i is still greater than start and the preceding unit is
* a lead surrogate.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then c is set to that unpaired surrogate.
*
* i must be a modifiable lvalue; the arguments are evaluated more than
* once. The expansion declares a local named __c2.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @stable ICU 2.4
*/
#define U16_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_TRAIL(c)) { \
uint16_t __c2; \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well. The offset is decremented a second
* time only when a complete pair was found.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then c is set to U+FFFD
* and the offset has moved back by exactly one code unit.
*
* i must be a modifiable lvalue; the arguments are evaluated more than
* once. The expansion declares a local named __c2.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @stable ICU 60
*/
#define U16_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(s)[--(i)]; \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(U16_IS_SURROGATE_TRAIL(c) && (i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} else { \
(c)=0xfffd; \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* Moves back by one unit, and by a second unit when the unit now at i is
* a trail surrogate; the unit before it is never inspected.
* i must be a modifiable lvalue.
*
* @param s const UChar * string
* @param i string offset
* @see U16_BACK_1
* @stable ICU 2.4
*/
#define U16_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[--(i)])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* Moves back by one unit, and by a second unit only when the unit now at
* i is a trail surrogate, i is still greater than start, and s[i-1] is a
* lead surrogate.
* i must be a modifiable lvalue; s and i are evaluated more than once.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, must be start<i
* @see U16_BACK_1_UNSAFE
* @stable ICU 2.4
*/
#define U16_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* n is evaluated once and copied into a local int32_t named __N;
* U16_BACK_1_UNSAFE is then applied while that counter is positive, so a
* zero or negative n leaves i unchanged. The start of the string is not
* checked.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_BACK_N
* @stable ICU 2.4
*/
#define U16_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0) { \
U16_BACK_1_UNSAFE(s, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* n is evaluated once and copied into a local int32_t named __N.
* U16_BACK_1 is applied while that counter is positive and i is greater
* than start, so i may move back by fewer than n code points.
* A zero or negative n leaves i unchanged.
*
* @param s const UChar * string
* @param start start of string
* @param i string offset, must be start<i
* @param n number of code points to skip
* @see U16_BACK_N_UNSAFE
* @stable ICU 2.4
*/
#define U16_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
int32_t __N=(n); \
while(__N>0 && (i)>(start)) { \
U16_BACK_1(s, start, i); \
--__N; \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* Only s[i-1] is inspected: any lead surrogate there causes the
* increment, without checking s[i] or either end of the string.
* i must be a modifiable lvalue.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_LIMIT
* @stable ICU 2.4
*/
#define U16_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
if(U16_IS_LEAD((s)[(i)-1])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The increment happens only when start<i, i is inside the string,
* s[i-1] is a lead surrogate and s[i] is a trail surrogate. The boundary
* tests come first, so no unit is read when i is at start or at length.
*
* The length can be negative for a NUL-terminated string; i then counts
* as inside the string and s[i] is read.
* i must be a modifiable lvalue; the arguments are evaluated more than once.
*
* @param s const UChar * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, start<=i<=length
* @param length int32_t string length
* @see U16_SET_CP_LIMIT_UNSAFE
* @stable ICU 2.4
*/
#define U16_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
} UPRV_BLOCK_MACRO_END
#endif

25
thirdparty/icu4c/common/unicode/utf32.h vendored Normal file
View File

@@ -0,0 +1,25 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf32.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep20
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: UTF-32 macros
*
* This file is obsolete and its contents moved to utf_old.h.
* See utf_old.h and Jitterbug 2150 and its discussion on the ICU mailing list
* in September 2002.
*/

882
thirdparty/icu4c/common/unicode/utf8.h vendored Normal file
View File

@@ -0,0 +1,882 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 1999-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utf8.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep13
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: 8-bit Unicode handling macros
*
* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (https://unicode-org.github.io/icu/userguide/strings).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
#ifndef __UTF8_H__
#define __UTF8_H__
#include <stdbool.h>
#include "unicode/umachine.h"
#ifndef __UTF_H__
# include "unicode/utf.h"
#endif
/* internal definitions ----------------------------------------------------- */
/**
* Number of trail bytes that follow a UTF-8 lead byte.
* Yields 0 for anything that is not a lead byte (0..0xc1 and 0xf5..0xff);
* otherwise 1, plus one for a byte of at least 0xe0, plus one more for a
* byte of at least 0xf0.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES(leadByte) \
    (!U8_IS_LEAD(leadByte) ? 0 : \
     1+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
/**
* Number of trail bytes that follow the lead byte of a valid UTF-8 sequence.
* The result is simply how many of the thresholds 0xc2, 0xe0 and 0xf0 the
* byte reaches, so it is 0 for 0..0xc1. Undefined for 0xf5..0xff.
* leadByte might be evaluated multiple times.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
*
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
* @internal
*/
#define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    (((uint8_t)(leadByte)>=0xf0)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xc2))
/**
* Strips the length-marker bits from a UTF-8 lead byte in place, keeping
* only the low (6 - countTrailBytes) bits that belong to the code point value.
* leadByte must be a modifiable lvalue; the macro also yields the masked value.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this file and thus must remain stable.
* @internal
*/
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) \
    ((leadByte)&=((1<<(6-(countTrailBytes)))-1))
/**
* Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
* Lead byte E0..EF bits 3..0 are used as byte index,
* first trail byte bits 7..5 are used as bit index into that byte.
*
* Reading the table with that indexing:
* - entry 0 (lead E0) is 0x20: only bit 5 is set, i.e. first trail byte A0..BF
* - entry 0xD (lead ED) is 0x10: only bit 4 is set, i.e. first trail byte 80..9F
* - every other entry is 0x30: bits 4 and 5 are set, i.e. first trail byte 80..BF
* @see U8_IS_VALID_LEAD3_AND_T1
* @internal
*/
#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
/**
* Internal 3-byte UTF-8 validity check.
* Looks up the U8_LEAD3_T1_BITS entry selected by the low 4 bits of lead and
* tests the bit selected by the top 3 bits of t1.
* Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
* @internal
*/
#define U8_IS_VALID_LEAD3_AND_T1(lead, t1) \
    ((1<<((uint8_t)(t1)>>5))&U8_LEAD3_T1_BITS[(lead)&0xf])
/**
* Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.
* Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
* First trail byte bits 7..4 are used as byte index,
* lead byte F0..F4 bits 2..0 are used as bit index into that byte.
*
* Reading the table with that indexing:
* - entry 8 (first trail byte 80..8F) is 0x1E: bits 1..4 are set, i.e. lead F1..F4
* - entries 9, 0xA and 0xB (first trail byte 90..BF) are 0x0F: bits 0..3 are set,
*   i.e. lead F0..F3
* - every other entry is 0, so any other first trail byte is rejected
* @see U8_IS_VALID_LEAD4_AND_T1
* @internal
*/
#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
/**
* Internal 4-byte UTF-8 validity check.
* Looks up the U8_LEAD4_T1_BITS entry selected by the top 4 bits of t1 and
* tests the bit selected by the low 3 bits of lead.
* Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
* @internal
*/
#define U8_IS_VALID_LEAD4_AND_T1(lead, t1) \
    ((1<<((lead)&7))&U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4])
/**
* Function for handling "next code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
*
* Only the declaration is visible in this header; the notes below are
* inferred from the parameter names and types.
* NOTE(review): presumably s is the UTF-8 text, *pi the in/out byte offset,
* length the string length and c the byte already read by the calling
* macro — confirm against the implementation.
* NOTE(review): the meaning of the strict values is not documented here —
* confirm against the implementation before relying on it.
* @internal
*/
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, int8_t strict);
/**
* Function for handling "append code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
*
* Only the declaration is visible in this header; the notes below are
* inferred from the parameter names and types.
* NOTE(review): presumably s is the destination buffer, i the write offset,
* length the buffer capacity, c the code point to append, and the return
* value the offset after the write — confirm against the implementation.
* NOTE(review): pIsError looks like an optional error flag; whether it may
* be NULL is not visible here.
* @internal
*/
U_CAPI int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
/**
* Function for handling "previous code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
*
* Only the declaration is visible in this header; the notes below are
* inferred from the parameter names and types.
* NOTE(review): presumably s is the UTF-8 text, start the lowest offset
* that may be read, *pi the in/out byte offset and c the byte already read
* by the calling macro — confirm against the implementation.
* NOTE(review): the meaning of the strict values is not documented here —
* confirm against the implementation before relying on it.
* @internal
*/
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, int8_t strict);
/**
* Function for handling "skip backward one code point" with error-checking.
*
* This is internal since it is not meant to be called directly by external clients;
* however it is called by public macros in this
* file and thus must remain stable, and should not be hidden when other internal
* functions are hidden (otherwise public macros would fail to compile).
*
* Only the declaration is visible in this header; the note below is
* inferred from the parameter names and types.
* NOTE(review): presumably s is the UTF-8 text, start the lowest offset
* that may be read, i the current byte offset, and the return value the
* adjusted offset — confirm against the implementation.
* @internal
*/
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
/* single-code point definitions -------------------------------------------- */
/**
* Tests whether a code unit (byte) encodes a code point by itself
* (US-ASCII 0..0x7f), i.e. whether its top bit (0x80) is clear.
* @param c 8-bit code unit (byte)
* @return true or false
* @stable ICU 2.4
*/
#define U8_IS_SINGLE(c) (!((c)&0x80))
/**
* Tests whether a code unit (byte) is a UTF-8 lead byte (0xC2..0xF4).
* @param c 8-bit code unit (byte)
* @return true or false
* @stable ICU 2.4
*/
#define U8_IS_LEAD(c) (0x33>(uint8_t)((c)-0xc2))
// 0x33=0xf4-0xc2+1: after subtracting 0xc2, lead bytes land in 0..0x32
/**
* Tests whether a code unit (byte) is a UTF-8 trail byte (0x80..0xBF).
* Reinterpreted as a signed 8-bit value, those bytes are exactly the values below -0x40.
* @param c 8-bit code unit (byte)
* @return true or false
* @stable ICU 2.4
*/
#define U8_IS_TRAIL(c) (-0x41>=(int8_t)(c))
/**
* Number of code units (bytes) that the UTF-8 encoding of a code point occupies.
* Surrogates (U+D800..U+DFFF) and values above U+10FFFF yield 0.
* @param c 32-bit code point
* @return 1..4, or 0 if c is a surrogate or not a Unicode code point
* @stable ICU 2.4
*/
#define U8_LENGTH(c) \
    ((uint32_t)(c)<0x80 ? 1 : \
     (uint32_t)(c)<0x800 ? 2 : \
     (uint32_t)(c)<0xd800 ? 3 : \
     ((uint32_t)(c)<0xe000 || 0x10ffff<(uint32_t)(c)) ? 0 : \
     (uint32_t)(c)<0x10000 ? 3 : 4)
/**
* The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
* This is the largest value that U8_LENGTH can return.
* @return 4
* @stable ICU 2.4
*/
#define U8_MAX_LENGTH 4
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
* The result is undefined if the offset points to an illegal UTF-8
* byte sequence.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_GET
* @stable ICU 2.4
*/
#define U8_GET_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* operate on a private copy so that the caller's offset is left untouched */ \
    int32_t _u8_get_unsafe_pos=(int32_t)(i); \
    /* back up to the lead byte, then decode forward from there */ \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_pos); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_pos, c); \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to an illegal UTF-8 byte sequence, then
* c is set to a negative value.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset
* @param i int32_t string offset, must be start<=i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_GET_UNSAFE
* @stable ICU 2.4
*/
#define U8_GET(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* operate on a private copy so that the caller's offset is left untouched */ \
    int32_t _u8_get_pos=(i); \
    /* back up to the start of the code point, then decode it with error checking */ \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT(s, _u8_get_pos, length, c); \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
*
* The length can be negative for a NUL-terminated string.
*
* If the offset points to an illegal UTF-8 byte sequence, then
* c is set to U+FFFD.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_GET() if that distinction is important.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset
* @param i int32_t string offset, must be start<=i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_GET
* @stable ICU 51
*/
#define U8_GET_OR_FFFD(s, start, i, length, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* operate on a private copy so that the caller's offset is left untouched */ \
    int32_t _u8_get_pos=(i); \
    /* back up to the start of the code point, then decode it with U+FFFD substitution */ \
    U8_SET_CP_START(s, start, _u8_get_pos); \
    U8_NEXT_OR_FFFD(s, _u8_get_pos, length, c); \
} UPRV_BLOCK_MACRO_END
/* definitions with forward iteration --------------------------------------- */
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* The result is undefined if the offset points to a trail byte
* or an illegal UTF-8 sequence.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_NEXT
* @stable ICU 2.4
*/
#define U8_NEXT_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(U8_IS_SINGLE(c)) { \
        /* ASCII: the byte itself is the code point */ \
    } else if((c)>=0xf0) { \
        /* four bytes: 3 bits from the lead byte, 6 bits from each of three trail bytes */ \
        (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
        (i)+=3; \
    } else if((c)>=0xe0) { \
        /* three bytes: no (c&0xf) needed, the cast to (UChar) truncates the upper bits after <<12 */ \
        (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
        (i)+=2; \
    } else { \
        /* two bytes: 5 bits from the lead byte, 6 bits from the trail byte */ \
        (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
* c is set to a negative value.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_NEXT_UNSAFE
* @stable ICU 2.4
*/
#define U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
// U_SENTINEL is the substitute value that U8_INTERNAL_NEXT_OR_SUB stores in c for an ill-formed sequence.
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
* c is set to U+FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_NEXT() if that distinction is important.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_NEXT
* @stable ICU 51
*/
#define U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
// 0xfffd is the substitute value that U8_INTERNAL_NEXT_OR_SUB stores in c for an ill-formed sequence.
/** @internal */
#define U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub) UPRV_BLOCK_MACRO_BEGIN { \
(c)=(uint8_t)(s)[(i)++]; \
if(!U8_IS_SINGLE(c)) { \
uint8_t __t = 0; \
if((i)!=(length) && \
/* fetch/validate/assemble all but last trail byte */ \
((c)>=0xe0 ? \
((c)<0xf0 ? /* U+0800..U+FFFF except surrogates */ \
U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
(__t&=0x3f, 1) \
: /* U+10000..U+10FFFF */ \
((c)-=0xf0)<=4 && \
U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
(__t=(s)[i]-0x80)<=0x3f) && \
/* valid second-to-last trail byte */ \
((c)=((c)<<6)|__t, ++(i)!=(length)) \
: /* U+0080..U+07FF */ \
(c)>=0xc2 && ((c)&=0x1f, 1)) && \
/* last trail byte */ \
(__t=(s)[i]-0x80)<=0x3f && \
((c)=((c)<<6)|__t, ++(i), 1)) { \
} else { \
(c)=(sub); /* ill-formed*/ \
} \
} \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* @param s uint8_t * string buffer (written to, so it must not be const)
* @param i string offset
* @param c code point to append
* @see U8_APPEND
* @stable ICU 2.4
*/
#define U8_APPEND_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    /* evaluate c exactly once, as an unsigned value */ \
    const uint32_t __cp=(c); \
    if(__cp<0x80) { \
        /* one byte */ \
        (s)[(i)++]=(uint8_t)__cp; \
    } else if(__cp<0x800) { \
        /* two bytes */ \
        (s)[(i)++]=(uint8_t)((__cp>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((__cp&0x3f)|0x80); \
    } else if(__cp<0x10000) { \
        /* three bytes */ \
        (s)[(i)++]=(uint8_t)((__cp>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((__cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__cp&0x3f)|0x80); \
    } else { \
        /* four bytes */ \
        (s)[(i)++]=(uint8_t)((__cp>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((__cp>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((__cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__cp&0x3f)|0x80); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks for a valid code point.
* If a non-ASCII code point is written, checks for sufficient space in the string.
* If the code point is not valid or trail bytes do not fit,
* then isError is set to true.
*
* @param s uint8_t * string buffer (written to, so it must not be const)
* @param i int32_t string offset, must be i<capacity
* @param capacity int32_t size of the string buffer
* @param c UChar32 code point to append
* @param isError output UBool set to true if an error occurs, otherwise not modified
* @see U8_APPEND_UNSAFE
* @stable ICU 2.4
*/
#define U8_APPEND(s, i, capacity, c, isError) UPRV_BLOCK_MACRO_BEGIN { \
    /* evaluate c exactly once, as an unsigned value */ \
    const uint32_t __cp=(c); \
    if(__cp<0x80) { \
        /* ASCII is stored without a capacity check */ \
        (s)[(i)++]=(uint8_t)__cp; \
    } else if(__cp<0x800 && (capacity)>(i)+1) { \
        /* two bytes */ \
        (s)[(i)++]=(uint8_t)((__cp>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((__cp&0x3f)|0x80); \
    } else if((__cp<0xd800 || (__cp>=0xe000 && __cp<0x10000)) && (capacity)>(i)+2) { \
        /* three bytes; the surrogate range 0xd800..0xdfff is excluded */ \
        (s)[(i)++]=(uint8_t)((__cp>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((__cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__cp&0x3f)|0x80); \
    } else if(__cp>0xffff && __cp<0x110000 && (capacity)>(i)+3) { \
        /* four bytes */ \
        (s)[(i)++]=(uint8_t)((__cp>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((__cp>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((__cp>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__cp&0x3f)|0x80); \
    } else { \
        /* invalid code point, or the sequence does not fit: nothing is written */ \
        (isError)=true; \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_FWD_1
* @stable ICU 2.4
*/
#define U8_FWD_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* skip the byte at i plus however many trail bytes it announces */ \
    (i)+=U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i])+1; \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @see U8_FWD_1_UNSAFE
* @stable ICU 2.4
*/
#define U8_FWD_1(s, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    const uint8_t __lead=(s)[(i)++]; \
    /* anything that is not a lead byte, or a lead byte at the very end, counts as one unit */ \
    if(U8_IS_LEAD(__lead) && (i)!=(length)) { \
        const uint8_t __trail1=(s)[i]; \
        if(__lead<0xe0) { \
            /* two-byte sequence */ \
            if(U8_IS_TRAIL(__trail1)) { \
                ++(i); \
            } \
        } else if(__lead<0xf0) { \
            /* three-byte sequence: stop in front of the first byte that does not fit */ \
            if(U8_IS_VALID_LEAD3_AND_T1(__lead, __trail1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } else { \
            /* four-byte sequence (lead byte >=0xf0) */ \
            if(U8_IS_VALID_LEAD4_AND_T1(__lead, __trail1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_FWD_N
* @stable ICU 2.4
*/
#define U8_FWD_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    /* n is evaluated once; a count of zero or less skips nothing */ \
    int32_t __remaining=(n); \
    for(; __remaining>0; --__remaining) { \
        U8_FWD_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param i int32_t string offset, must be i<length
* @param length int32_t string length
* @param n number of code points to skip
* @see U8_FWD_N_UNSAFE
* @stable ICU 2.4
*/
#define U8_FWD_N(s, i, length, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __remaining=(n); \
    /* stop early at the string limit, or at the NUL when length is negative */ \
    for(; __remaining>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0)); --__remaining) { \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to a UTF-8 trail byte,
* then the offset is moved backward to the corresponding lead byte.
* Otherwise, it is not modified.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_SET_CP_START
* @stable ICU 2.4
*/
#define U8_SET_CP_START_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* walk backward while the current byte is a trail byte */ \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to a UTF-8 trail byte,
* then the offset is moved backward to the corresponding lead byte.
* Otherwise, it is not modified.
*
* "Safe" macro, checks for illegal sequences and for string boundaries.
* Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i
* @see U8_SET_CP_START_UNSAFE
* @see U8_TRUNCATE_IF_INCOMPLETE
* @stable ICU 2.4
*/
#define U8_SET_CP_START(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    if(!U8_IS_TRAIL((s)[(i)])) { \
        /* not on a trail byte: the offset is left as it is */ \
    } else { \
        /* on a trail byte: the helper returns the adjusted offset */ \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* If the string ends with a UTF-8 byte sequence that is valid so far
* but incomplete, then reduce the length of the string to end before
* the lead byte of that incomplete sequence.
* For example, if the string ends with E1 80, the length is reduced by 2.
*
* In all other cases (the string ends with a complete sequence, or it is not
* possible for any further trail byte to extend the trailing sequence)
* the length remains unchanged.
*
* Useful for processing text split across multiple buffers
* (save the incomplete sequence for later)
* and for optimizing iteration
* (check for string length only once per character).
*
* "Safe" macro, checks for illegal sequences and for string boundaries.
* Unlike U8_SET_CP_START(), this macro never reads s[length].
*
* (In UTF-16, simply check for U16_IS_LEAD(last code unit).)
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param length int32_t string length (usually start<=length)
* @see U8_SET_CP_START
* @stable ICU 61
*/
#define U8_TRUNCATE_IF_INCOMPLETE(s, start, length) UPRV_BLOCK_MACRO_BEGIN { \
    if((length)>(start)) { \
        /* (s) is parenthesized at every use so that pointer expressions such as p+n index correctly */ \
        uint8_t __b1=(s)[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            /* a lead byte at the very end is always an incomplete sequence */ \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=(s)[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                /* 3- or 4-byte lead followed by one valid trail byte: still incomplete */ \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                uint8_t __b3=(s)[(length)-3]; \
                /* 4-byte lead followed by two trail bytes: still incomplete */ \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END
/* definitions with backward iteration -------------------------------------- */
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* The result is undefined if the offset is behind an illegal UTF-8 sequence.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_PREV
* @stable ICU 2.4
*/
#define U8_PREV_UNSAFE(s, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        /* c is the last trail byte: keep its 6 payload bits and collect the rest going backward */ \
        uint8_t __byte, __trails=1, __bits=6; \
        (c)&=0x3f; \
        /* every byte below 0xc0 is treated as another trail byte */ \
        while((__byte=(s)[--(i)])<0xc0) { \
            (c)|=(UChar32)(__byte&0x3f)<<__bits; \
            ++__trails; \
            __bits+=6; \
        } \
        /* __byte is now the lead byte: strip its length marker and add the top bits */ \
        U8_MASK_LEAD_BYTE(__byte, __trails); \
        (c)|=(UChar32)__byte<<__bits; \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_PREV_UNSAFE
* @stable ICU 2.4
*/
#define U8_PREV(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        /* non-ASCII byte: delegate to the out-of-line helper with strict=-1. */ \
        /* (s) is parenthesized so that the cast applies to the whole argument expression. */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.
*
* This macro does not distinguish between a real U+FFFD in the text
* and U+FFFD returned for an ill-formed sequence.
* Use U8_PREV() if that distinction is important.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @param c output UChar32 variable, set to U+FFFD in case of an error
* @see U8_PREV
* @stable ICU 51
*/
#define U8_PREV_OR_FFFD(s, start, i, c) UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        /* non-ASCII byte: delegate to the out-of-line helper with strict=-3. */ \
        /* (s) is parenthesized so that the cast applies to the whole argument expression. */ \
        (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -3); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_BACK_1
* @stable ICU 2.4
*/
#define U8_BACK_1_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* always step back at least once, then keep going while on a trail byte */ \
    do { \
        --(i); \
    } while(U8_IS_TRAIL((s)[i])); \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<i
* @see U8_BACK_1_UNSAFE
* @stable ICU 2.4
*/
#define U8_BACK_1(s, start, i) UPRV_BLOCK_MACRO_BEGIN { \
    /* step back one byte; that is the whole move unless we landed on a trail byte */ \
    --(i); \
    if(U8_IS_TRAIL((s)[i])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_BACK_N
* @stable ICU 2.4
*/
#define U8_BACK_N_UNSAFE(s, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    /* n is evaluated once; a count of zero or less moves nothing */ \
    int32_t __remaining=(n); \
    for(; __remaining>0; --__remaining) { \
        U8_BACK_1_UNSAFE(s, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param start int32_t index of the start of the string
* @param i int32_t string offset, must be start<i
* @param n number of code points to skip
* @see U8_BACK_N_UNSAFE
* @stable ICU 2.4
*/
#define U8_BACK_N(s, start, i, n) UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __remaining=(n); \
    /* stop early once the offset has reached the start of the string */ \
    for(; __remaining>0 && (i)>(start); --__remaining) { \
        U8_BACK_1(s, start, i); \
    } \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind a partial multi-byte sequence,
* then the offset is incremented to behind the whole sequence.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @see U8_SET_CP_LIMIT
* @stable ICU 2.4
*/
#define U8_SET_CP_LIMIT_UNSAFE(s, i) UPRV_BLOCK_MACRO_BEGIN { \
U8_BACK_1_UNSAFE(s, i); \
U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind a partial multi-byte sequence,
* then the offset is incremented to behind the whole sequence.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The length can be negative for a NUL-terminated string.
*
* @param s const uint8_t * string
* @param start int32_t starting string offset (usually 0)
* @param i int32_t string offset, must be start<=i<=length
* @param length int32_t string length
* @see U8_SET_CP_LIMIT_UNSAFE
* @stable ICU 2.4
*/
#define U8_SET_CP_LIMIT(s, start, i, length) UPRV_BLOCK_MACRO_BEGIN { \
    /* nothing to adjust at the very start of the string ... */ \
    if((start)<(i)) { \
        /* ... or at its end; a negative length means NUL-terminated, so no end is known */ \
        if((i)<(length) || (length)<0) { \
            U8_BACK_1(s, start, i); \
            U8_FWD_1(s, i, length); \
        } \
    } \
} UPRV_BLOCK_MACRO_END
#endif

1201
thirdparty/icu4c/common/unicode/utf_old.h vendored Normal file

File diff suppressed because it is too large Load Diff

506
thirdparty/icu4c/common/unicode/utrace.h vendored Normal file
View File

@@ -0,0 +1,506 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
* Copyright (C) 2003-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: utrace.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003aug06
* created by: Markus W. Scherer
*
* Definitions for ICU tracing/logging.
*
*/
#ifndef __UTRACE_H__
#define __UTRACE_H__
#include <stdarg.h>
#include "unicode/utypes.h"
/**
* \file
* \brief C API: Definitions for ICU tracing/logging.
*
* This provides API for debugging the internals of ICU without the use of
* a traditional debugger.
*
* By default, tracing is disabled in ICU. If you need to debug ICU with
* tracing, please compile ICU with the --enable-tracing configure option.
*/
U_CDECL_BEGIN
/**
* Trace severity levels. Higher levels increase the verbosity of the trace output.
* @see utrace_setLevel
* @stable ICU 2.8
*/
typedef enum UTraceLevel {
    /** No tracing at all. @stable ICU 2.8 */
    UTRACE_OFF = -1,
    /** Only error conditions are traced. @stable ICU 2.8 */
    UTRACE_ERROR = 0,
    /** Errors and warnings are traced. @stable ICU 2.8 */
    UTRACE_WARNING = 3,
    /** Opening and closing of ICU services is traced. @stable ICU 2.8 */
    UTRACE_OPEN_CLOSE = 5,
    /** An intermediate number of ICU operations is traced. @stable ICU 2.8 */
    UTRACE_INFO = 7,
    /** The maximum number of ICU operations is traced. @stable ICU 2.8 */
    UTRACE_VERBOSE = 9
} UTraceLevel;
/**
* These are the ICU functions that will be traced when tracing is enabled.
* @stable ICU 2.8
*/
typedef enum UTraceFunctionNumber {
UTRACE_FUNCTION_START=0,
UTRACE_U_INIT=UTRACE_FUNCTION_START,
UTRACE_U_CLEANUP,
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal trace location in the group that begins at UTRACE_FUNCTION_START.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UTRACE_FUNCTION_LIMIT,
#endif // U_HIDE_DEPRECATED_API
UTRACE_CONVERSION_START=0x1000,
UTRACE_UCNV_OPEN=UTRACE_CONVERSION_START,
UTRACE_UCNV_OPEN_PACKAGE,
UTRACE_UCNV_OPEN_ALGORITHMIC,
UTRACE_UCNV_CLONE,
UTRACE_UCNV_CLOSE,
UTRACE_UCNV_FLUSH_CACHE,
UTRACE_UCNV_LOAD,
UTRACE_UCNV_UNLOAD,
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal conversion trace location.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UTRACE_CONVERSION_LIMIT,
#endif // U_HIDE_DEPRECATED_API
UTRACE_COLLATION_START=0x2000,
UTRACE_UCOL_OPEN=UTRACE_COLLATION_START,
UTRACE_UCOL_CLOSE,
UTRACE_UCOL_STRCOLL,
UTRACE_UCOL_GET_SORTKEY,
UTRACE_UCOL_GETLOCALE,
UTRACE_UCOL_NEXTSORTKEYPART,
UTRACE_UCOL_STRCOLLITER,
UTRACE_UCOL_OPEN_FROM_SHORT_STRING,
UTRACE_UCOL_STRCOLLUTF8, /**< @stable ICU 50 */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal collation trace location.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UTRACE_COLLATION_LIMIT,
#endif // U_HIDE_DEPRECATED_API
/**
* The lowest resource/data location.
* @stable ICU 65
*/
UTRACE_UDATA_START=0x3000,
/**
* Indicates that a value was read from a resource bundle. Provides three
* C-style strings to UTraceData: type, file name, and resource path. The
* possible types are:
*
* - "string" (a string value was accessed)
* - "binary" (a binary value was accessed)
* - "intvector" (an integer vector value was accessed)
* - "int" (a signed integer value was accessed)
* - "uint" (an unsigned integer value was accessed)
* - "get" (a path was loaded, but the value was not accessed)
* - "getalias" (a path was loaded, and an alias was resolved)
*
* @stable ICU 65
*/
UTRACE_UDATA_RESOURCE=UTRACE_UDATA_START,
/**
* Indicates that a resource bundle was opened.
*
* Provides one C-style string to UTraceData: file name.
* @stable ICU 65
*/
UTRACE_UDATA_BUNDLE,
/**
* Indicates that a data file was opened, but not *.res files.
*
* Provides one C-style string to UTraceData: file name.
*
* @stable ICU 65
*/
UTRACE_UDATA_DATA_FILE,
/**
* Indicates that a *.res file was opened.
*
* This differs from UTRACE_UDATA_BUNDLE because a res file is typically
* opened only once per application runtime, but the bundle corresponding
* to that res file may be opened many times.
*
* Provides one C-style string to UTraceData: file name.
*
* @stable ICU 65
*/
UTRACE_UDATA_RES_FILE,
#ifndef U_HIDE_INTERNAL_API
/**
* One more than the highest normal resource/data trace location.
* @internal The numeric value may change over time, see ICU ticket #12420.
*/
UTRACE_RES_DATA_LIMIT,
#endif // U_HIDE_INTERNAL_API
/**
* The lowest break iterator location.
* @stable ICU 67
*/
UTRACE_UBRK_START=0x4000,
/**
* Indicates that a character instance of break iterator was created.
*
* @stable ICU 67
*/
UTRACE_UBRK_CREATE_CHARACTER = UTRACE_UBRK_START,
/**
* Indicates that a word instance of break iterator was created.
*
* @stable ICU 67
*/
UTRACE_UBRK_CREATE_WORD,
/**
* Indicates that a line instance of break iterator was created.
*
* Provides one C-style string to UTraceData: the lb value ("",
* "loose", "strict", or "normal").
*
* @stable ICU 67
*/
UTRACE_UBRK_CREATE_LINE,
/**
* Indicates that a sentence instance of break iterator was created.
*
* @stable ICU 67
*/
UTRACE_UBRK_CREATE_SENTENCE,
/**
* Indicates that a title instance of break iterator was created.
*
* @stable ICU 67
*/
UTRACE_UBRK_CREATE_TITLE,
/**
* Indicates that an internal dictionary break engine was created.
*
* Provides one C-style string to UTraceData: the script code that
* the break engine covers ("Hani", "Khmr", "Laoo", "Mymr", or "Thai").
*
* @stable ICU 67
*/
UTRACE_UBRK_CREATE_BREAK_ENGINE,
#ifndef U_HIDE_INTERNAL_API
/**
* One more than the highest normal break iterator trace location.
* @internal The numeric value may change over time, see ICU ticket #12420.
*/
UTRACE_UBRK_LIMIT,
#endif // U_HIDE_INTERNAL_API
} UTraceFunctionNumber;
/**
* Setter for the trace level.
* @param traceLevel A UTraceLevel value.
* @stable ICU 2.8
*/
U_CAPI void U_EXPORT2
utrace_setLevel(int32_t traceLevel);
/**
* Getter for the trace level.
* @return The UTraceLevel value being used by ICU.
* @stable ICU 2.8
*/
U_CAPI int32_t U_EXPORT2
utrace_getLevel(void);
/* Trace function pointers types ----------------------------- */
/**
* Type signature for the trace function to be called when entering a function.
* @param context value supplied at the time the trace functions are set.
* @param fnNumber Enum value indicating the ICU function being entered.
* @stable ICU 2.8
*/
typedef void U_CALLCONV
UTraceEntry(const void *context, int32_t fnNumber);
/**
* Type signature for the trace function to be called when exiting from a function.
* @param context value supplied at the time the trace functions are set.
* @param fnNumber Enum value indicating the ICU function being exited.
* @param fmt A formatting string that describes the number and types
* of arguments included with the variable args. The fmt
* string has the same form as the utrace_vformat format
* string.
* @param args A variable arguments list. Contents are described by
* the fmt parameter.
* @see utrace_vformat
* @stable ICU 2.8
*/
typedef void U_CALLCONV
UTraceExit(const void *context, int32_t fnNumber,
const char *fmt, va_list args);
/**
* Type signature for the trace function to be called from within an ICU function
* to display data or messages.
* @param context value supplied at the time the trace functions are set.
* @param fnNumber Enum value indicating the ICU function from which the data is being traced.
* @param level The current tracing level
* @param fmt A format string describing the tracing data that is supplied
* as variable args
* @param args The data being traced, passed as variable args.
* @stable ICU 2.8
*/
typedef void U_CALLCONV
UTraceData(const void *context, int32_t fnNumber, int32_t level,
const char *fmt, va_list args);
/**
* Set ICU Tracing functions. Installs application-provided tracing
* functions into ICU. After doing this, subsequent ICU operations
* will call back to the installed functions, providing a trace
* of the use of ICU. Passing a NULL pointer for a tracing function
* is allowed, and inhibits tracing action at points where that function
* would be called.
* <p>
* Tracing and Threads: Tracing functions are global to a process, and
* will be called in response to ICU operations performed by any
* thread. If tracing of an individual thread is desired, the
* tracing functions must themselves filter by checking that the
* current thread is the desired thread.
*
* @param context an uninterpreted pointer. Whatever is passed in
* here will in turn be passed to each of the tracing
* functions UTraceEntry, UTraceExit and UTraceData.
* ICU does not use or alter this pointer.
* @param e Callback function to be called on entry to a
* traced ICU function.
* @param x Callback function to be called on exit from a
* traced ICU function.
* @param d Callback function to be called from within a
* traced ICU function, for the purpose of providing
* data to the trace.
*
* @see utrace_getFunctions
* @stable ICU 2.8
*/
U_CAPI void U_EXPORT2
utrace_setFunctions(const void *context,
UTraceEntry *e, UTraceExit *x, UTraceData *d);
/**
* Get the currently installed ICU tracing functions. Note that a null function
* pointer will be returned if no trace function has been set.
* All four parameters are outputs: each points to a variable that receives
* the corresponding installed value.
*
* @param context Receives the currently installed tracing context.
* @param e Receives the currently installed UTraceEntry function.
* @param x Receives the currently installed UTraceExit function.
* @param d Receives the currently installed UTraceData function.
* @see utrace_setFunctions
* @stable ICU 2.8
*/
U_CAPI void U_EXPORT2
utrace_getFunctions(const void **context,
UTraceEntry **e, UTraceExit **x, UTraceData **d);
/*
*
* ICU trace format string syntax
*
* Format Strings are passed to UTraceData functions, and define the
* number and types of the trace data being passed on each call.
*
* The UTraceData function, which is supplied by the application,
* not by ICU, can either forward the trace data (passed via
* varargs) and the format string back to ICU for formatting into
* a displayable string, or it can interpret the format itself,
* and do as it wishes with the trace data.
*
*
* Goals for the format string
* - basic data output
* - easy to use for trace programmer
* - sufficient provision for data types for trace output readability
* - well-defined types and binary portable APIs
*
* Non-goals
* - printf compatibility
* - fancy formatting
* - argument reordering and other internationalization features
*
* ICU trace format strings contain plain text with argument inserts,
* much like standard printf format strings.
* Each insert begins with a '%', then optionally contains a 'v',
* then exactly one type character.
* Two '%' in a row represent a '%' instead of an insert.
* The trace format strings need not have \n at the end.
*
*
* Types
* -----
*
* Type characters:
* - c A char character in the default codepage.
* - s A NUL-terminated char * string in the default codepage.
* - S A UChar * string. Requires two params, (ptr, length). Length=-1 for nul term.
* - b A byte (8-bit integer).
* - h A 16-bit integer. Also a 16 bit Unicode code unit.
* - d A 32-bit integer. Also a 21 bit Unicode code point value.
* - l A 64-bit integer.
* - p A data pointer.
*
* Vectors
* -------
*
* If the 'v' is not specified, then one item of the specified type
* is passed in.
* If the 'v' (for "vector") is specified, then a vector of items of the
* specified type is passed in, via a pointer to the first item
* and an int32_t value for the length of the vector.
* Length==-1 means zero or NUL termination. Works for vectors of all types.
*
* Note: %vS is a vector of (UChar *) strings. The strings must
* be nul terminated as there is no way to provide a
* separate length parameter for each string. The length
* parameter (required for all vectors) is the number of
* strings, not the length of the strings.
*
* Examples
* --------
*
* These examples show the parameters that will be passed to an application's
* UTraceData() function for various formats.
*
* - the precise formatting is up to the application!
* - the examples use type casts for arguments only to _show_ the types of
* arguments without needing variable declarations in the examples;
* the type casts will not be necessary in actual code
*
* UTraceDataFunc(context, fnNumber, level,
* "There is a character %c in the string %s.", // Format String
* (char)c, (const char *)s); // varargs parameters
* -> There is a character 0x42 'B' in the string "Bravo".
*
* UTraceDataFunc(context, fnNumber, level,
* "Vector of bytes %vb vector of chars %vc",
* (const uint8_t *)bytes, (int32_t)bytesLength,
* (const char *)chars, (int32_t)charsLength);
* -> Vector of bytes
* 42 63 64 3f [4]
* vector of chars
* "Bcd?"[4]
*
* UTraceDataFunc(context, fnNumber, level,
* "An int32_t %d and a whole bunch of them %vd",
* (int32_t)-5, (const int32_t *)ints, (int32_t)intsLength);
* -> An int32_t 0xfffffffb and a whole bunch of them
* fffffffb 00000005 0000010a [3]
*
*/
/**
* Trace output Formatter. An application's UTraceData tracing functions may call
* back to this function to format the trace output in a
* human readable form. Note that a UTraceData function may choose
* to not format the data; it could, for example, save it
* in the raw form it was received (more compact), leaving
* formatting for a later trace analysis tool.
* @param outBuf pointer to a buffer to receive the formatted output. Output
* will be nul terminated if there is space in the buffer -
* that is, if the length of the requested output is less
* than the output buffer size.
* @param capacity Length of the output buffer.
* @param indent Number of spaces to indent the output. Intended to allow
* data displayed from nested functions to be indented for readability.
* @param fmt Format specification for the data to output
* @param args Data to be formatted.
* @return Length of formatted output, including the terminating NUL.
* If buffer capacity is insufficient, the required capacity is returned.
* @see utrace_format
* @stable ICU 2.8
*/
U_CAPI int32_t U_EXPORT2
utrace_vformat(char *outBuf, int32_t capacity,
int32_t indent, const char *fmt, va_list args);
/**
* Trace output Formatter. An application's UTraceData tracing functions may call
* this function to format any additional trace data, beyond that
* provided by default, in human readable form with the same
* formatting conventions used by utrace_vformat().
* This is the variable-argument (...) counterpart of utrace_vformat(),
* which takes a va_list instead.
* @param outBuf pointer to a buffer to receive the formatted output. Output
* will be nul terminated if there is space in the buffer -
* that is, if the length of the requested output is less
* than the output buffer size.
* @param capacity Length of the output buffer.
* @param indent Number of spaces to indent the output. Intended to allow
* data displayed from nested functions to be indented for readability.
* @param fmt Format specification for the data to output
* @param ... Data to be formatted.
* @return Length of formatted output, including the terminating NUL.
* If buffer capacity is insufficient, the required capacity is returned.
* @see utrace_vformat
* @stable ICU 2.8
*/
U_CAPI int32_t U_EXPORT2
utrace_format(char *outBuf, int32_t capacity,
int32_t indent, const char *fmt, ...);
/* Trace function numbers --------------------------------------------------- */
/**
* Get the name of a function from its trace function number.
*
* @param fnNumber The trace number for an ICU function
* (see UTraceFunctionNumber), passed as an int32_t.
* @return The name string for the function.
*
* @see UTraceFunctionNumber
* @stable ICU 2.8
*/
U_CAPI const char * U_EXPORT2
utrace_functionName(int32_t fnNumber);
U_CDECL_END
#endif

761
thirdparty/icu4c/common/unicode/utypes.h vendored Normal file
View File

@@ -0,0 +1,761 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (C) 1996-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* FILE NAME : UTYPES.H (formerly ptypes.h)
*
* Date Name Description
* 12/11/96 helena Creation.
* 02/27/97 aliu Added typedefs for UClassID, int8, int16, int32,
* uint8, uint16, and uint32.
* 04/01/97 aliu Added XP_CPLUSPLUS and modified to work under C as
* well as C++.
* Modified to use memcpy() for uprv_arrayCopy() fns.
* 04/14/97 aliu Added TPlatformUtilities.
* 05/07/97 aliu Added import/export specifiers (replacing the old
* broken EXT_CLASS). Added version number for our
* code. Cleaned up header.
* 6/20/97 helena Java class name change.
* 08/11/98 stephen UErrorCode changed from typedef to enum
* 08/12/98 erm Changed T_ANALYTIC_PACKAGE_VERSION to 3
* 08/14/98 stephen Added uprv_arrayCopy() for int8_t, int16_t, int32_t
* 12/09/98 jfitz Added BUFFER_OVERFLOW_ERROR (bug 1100066)
* 04/20/99 stephen Cleaned up & reworked for autoconf.
* Renamed to utypes.h.
* 05/05/99 stephen Changed to use <inttypes.h>
* 12/07/99 helena Moved copyright notice string from ucnv_bld.h here.
*******************************************************************************
*/
#ifndef UTYPES_H
#define UTYPES_H
#include "unicode/umachine.h"
#include "unicode/uversion.h"
#include "unicode/uconfig.h"
#include <float.h>
#if !U_NO_DEFAULT_INCLUDE_UTF_HEADERS
# include "unicode/utf.h"
#endif
/*!
* \file
* \brief Basic definitions for ICU, for both C and C++ APIs
*
* This file defines basic types, constants, and enumerations directly or
* indirectly by including other header files, especially utf.h for the
* basic character and string definitions and umachine.h for consistent
* integer and other types.
*/
/** @{ API visibility control */
/**
* \def U_SHOW_CPLUSPLUS_API
* When defined to 1 (=default) and compiled with a C++ compiler, both C and C++ APIs are visible.
* Otherwise, only C APIs are visible; this is for C++ users who want to
* restrict their usage to binary stable C APIs exported by ICU DLLs.
* When not compiled with a C++ compiler this is always forced to 0.
* @internal
*/
/**
* \def U_SHOW_CPLUSPLUS_HEADER_API
* When defined to 1 (=default) and compiled with a C++ compiler, C++ header-only APIs are visible.
* This is for C++ users who restrict their usage to binary stable C APIs exported by ICU DLLs
* (U_SHOW_CPLUSPLUS_API=0)
* but who still want to use C++ header-only APIs which do not rely on ICU DLL exports.
* When not compiled with a C++ compiler this is always forced to 0.
* @internal
*/
#if !defined(__cplusplus)
/* Not C++: discard any caller-supplied setting and switch both off. */
#   undef U_SHOW_CPLUSPLUS_API
#   undef U_SHOW_CPLUSPLUS_HEADER_API
#   define U_SHOW_CPLUSPLUS_API 0
#   define U_SHOW_CPLUSPLUS_HEADER_API 0
#else
/* C++: honor a caller-supplied setting, otherwise default to visible. */
#   if !defined(U_SHOW_CPLUSPLUS_API)
#       define U_SHOW_CPLUSPLUS_API 1
#   endif
#   if !defined(U_SHOW_CPLUSPLUS_HEADER_API)
#       define U_SHOW_CPLUSPLUS_HEADER_API 1
#   endif
#endif
/**
* \def U_HIDE_DRAFT_API
* Define this to 1 to request that draft API be "hidden"
* @internal
*/
/**
* \def U_HIDE_INTERNAL_API
* Define this to 1 to request that internal API be "hidden"
* @internal
*/
#if !U_DEFAULT_SHOW_DRAFT
/* Draft/internal API is hidden by default unless explicitly requested. */
#   ifndef U_SHOW_DRAFT_API
#       define U_HIDE_DRAFT_API 1
#   endif
#   ifndef U_SHOW_INTERNAL_API
#       define U_HIDE_INTERNAL_API 1
#   endif
#endif
/** @} */
/*===========================================================================*/
/* ICUDATA naming scheme */
/*===========================================================================*/
/**
* \def U_ICUDATA_TYPE_LETTER
*
* This is a platform-dependent string containing one letter:
* - b for big-endian, ASCII-family platforms
* - l for little-endian, ASCII-family platforms
* - e for big-endian, EBCDIC-family platforms
* This letter is part of the common data file name.
* @stable ICU 2.0
*/
/**
* \def U_ICUDATA_TYPE_LITLETTER
* The non-string form of U_ICUDATA_TYPE_LETTER
* @stable ICU 2.0
*/
#if U_CHARSET_FAMILY && U_IS_BIG_ENDIAN
    /* EBCDIC - should always be BE */
#   define U_ICUDATA_TYPE_LETTER "e"
#   define U_ICUDATA_TYPE_LITLETTER e
#elif U_CHARSET_FAMILY
    /* Little-endian EBCDIC is not supported: stop the build here. */
#   error "Don't know what to do with little endian EBCDIC!"
#   define U_ICUDATA_TYPE_LETTER "x"
#   define U_ICUDATA_TYPE_LITLETTER x
#elif U_IS_BIG_ENDIAN
    /* Big-endian ASCII */
#   define U_ICUDATA_TYPE_LETTER "b"
#   define U_ICUDATA_TYPE_LITLETTER b
#else
    /* Little-endian ASCII */
#   define U_ICUDATA_TYPE_LETTER "l"
#   define U_ICUDATA_TYPE_LITLETTER l
#endif
/**
* A single string literal containing the icudata stub name, e.g. 'icudt18e' for
* ICU 1.8.x on EBCDIC.
* It is formed by adjacent string literal concatenation of "icudt",
* U_ICU_VERSION_SHORT and U_ICUDATA_TYPE_LETTER.
* @stable ICU 2.0
*/
#define U_ICUDATA_NAME "icudt" U_ICU_VERSION_SHORT U_ICUDATA_TYPE_LETTER
#ifndef U_HIDE_INTERNAL_API
#define U_USRDATA_NAME "usrdt" U_ICU_VERSION_SHORT U_ICUDATA_TYPE_LETTER /**< @internal */
#define U_USE_USRDATA 0 /**< @internal */
#endif /* U_HIDE_INTERNAL_API */
/**
* U_ICUDATA_ENTRY_POINT is the name of the DLL entry point to the ICU data library.
* Defined as a literal, not a string.
* Tricky Preprocessor use - ## operator replaces macro parameters with the literal string
* from the corresponding macro invocation, _before_ other macro substitutions.
* Nested \#defines are needed to get the actual version numbers rather than
* the literal text U_ICU_VERSION_MAJOR_NUM into the name.
* The net result will be something of the form
* \#define U_ICUDATA_ENTRY_POINT icudt19_dat
*
* Note: the helper macros this definition expands to are only defined below
* when U_HIDE_INTERNAL_API is not defined.
* @stable ICU 2.4
*/
#define U_ICUDATA_ENTRY_POINT U_DEF2_ICUDATA_ENTRY_POINT(U_ICU_VERSION_MAJOR_NUM,U_LIB_SUFFIX_C_NAME)
#ifndef U_HIDE_INTERNAL_API
/**
* Do not use. Note that it's OK for the 2nd argument to be undefined (literal).
* Extra level of indirection so that the arguments are macro-expanded
* before U_DEF_ICUDATA_ENTRY_POINT pastes them with ##.
* @internal
*/
#define U_DEF2_ICUDATA_ENTRY_POINT(major,suff) U_DEF_ICUDATA_ENTRY_POINT(major,suff)
/**
* Do not use.
* Pastes the pieces together: icudt<major>_dat, or icudt<suff><major>_dat
* when U_LIB_SUFFIX_C_NAME is defined. A definition supplied earlier
* takes precedence over this one.
* @internal
*/
#ifndef U_DEF_ICUDATA_ENTRY_POINT
/* affected by symbol renaming. See platform.h */
#ifndef U_LIB_SUFFIX_C_NAME
#define U_DEF_ICUDATA_ENTRY_POINT(major, suff) icudt##major##_dat
#else
#define U_DEF_ICUDATA_ENTRY_POINT(major, suff) icudt##suff ## major##_dat
#endif
#endif
#endif /* U_HIDE_INTERNAL_API */
/**
* \def NULL
* Fallback definition of NULL, used only when no other header has defined it:
* nullptr for C++ and ((void *)0) for C.
* @stable ICU 2.0
*/
#if !defined(NULL)
#   if defined(__cplusplus)
#       define NULL nullptr
#   else
#       define NULL ((void *)0)
#   endif
#endif
/*===========================================================================*/
/* Calendar/TimeZone data types */
/*===========================================================================*/
/**
* Date and Time data type.
* This is a primitive data type that holds the date and time
* as the number of milliseconds since 1970-jan-01, 00:00 UTC.
* UTC leap seconds are ignored.
* @stable ICU 2.0
*/
typedef double UDate;
/** The number of milliseconds per second @stable ICU 2.0 */
#define U_MILLIS_PER_SECOND (1000)
/** The number of milliseconds per minute @stable ICU 2.0 */
#define U_MILLIS_PER_MINUTE (60000)
/** The number of milliseconds per hour @stable ICU 2.0 */
#define U_MILLIS_PER_HOUR (3600000)
/** The number of milliseconds per day @stable ICU 2.0 */
#define U_MILLIS_PER_DAY (86400000)
/**
* Maximum UDate value (the largest finite double, DBL_MAX from <float.h>)
* @stable ICU 4.8
*/
#define U_DATE_MAX DBL_MAX
/**
* Minimum UDate value (the negation of U_DATE_MAX).
* Parenthesized, like the U_MILLIS_PER_* macros above, so that the
* expansion always behaves as a single operand.
* @stable ICU 4.8
*/
#define U_DATE_MIN (-U_DATE_MAX)
/*===========================================================================*/
/* Shared library/DLL import-export API control */
/*===========================================================================*/
/*
* Control of symbol import/export.
* ICU is separated into three libraries.
*/
/**
* \def U_COMBINED_IMPLEMENTATION
* Set to export library symbols from inside the ICU library
* when all of ICU is in a single library.
* This can be set as a compiler option while building ICU, and it
* needs to be the first one tested to override U_COMMON_API, U_I18N_API, etc.
* @stable ICU 2.0
*/
/**
* \def U_DATA_API
* Set to export library symbols from inside the stubdata library,
* and to import them from outside.
* @stable ICU 3.0
*/
/**
* \def U_COMMON_API
* Set to export library symbols from inside the common library,
* and to import them from outside.
* @stable ICU 2.0
*/
/**
* \def U_I18N_API
* Set to export library symbols from inside the i18n library,
* and to import them from outside.
* @stable ICU 2.0
*/
/**
* \def U_LAYOUT_API
* Set to export library symbols from inside the layout engine library,
* and to import them from outside.
* @stable ICU 2.0
*/
/**
* \def U_LAYOUTEX_API
* Set to export library symbols from inside the layout extensions library,
* and to import them from outside.
* @stable ICU 2.6
*/
/**
* \def U_IO_API
* Set to export library symbols from inside the ustdio library,
* and to import them from outside.
* @stable ICU 2.0
*/
/**
* \def U_TOOLUTIL_API
* Set to export library symbols from inside the toolutil library,
* and to import them from outside.
* @stable ICU 3.4
*/
#ifdef U_IN_DOXYGEN
// This definition is required when generating the API docs.
#define U_COMBINED_IMPLEMENTATION 1
#endif
/*
 * Select the expansion of each U_xxx_API macro. The first branch whose
 * U_xxx_IMPLEMENTATION macro is defined wins, so the order of the tests
 * below is significant. In the per-library branches the library being
 * built gets U_EXPORT and every other library gets U_IMPORT.
 */
#if defined(U_COMBINED_IMPLEMENTATION)
/* All of ICU in a single library: every API macro is U_EXPORT. */
#define U_DATA_API U_EXPORT
#define U_COMMON_API U_EXPORT
#define U_I18N_API U_EXPORT
#define U_LAYOUT_API U_EXPORT
#define U_LAYOUTEX_API U_EXPORT
#define U_IO_API U_EXPORT
#define U_TOOLUTIL_API U_EXPORT
#elif defined(U_STATIC_IMPLEMENTATION)
/* U_STATIC_IMPLEMENTATION: every API macro expands to nothing. */
#define U_DATA_API
#define U_COMMON_API
#define U_I18N_API
#define U_LAYOUT_API
#define U_LAYOUTEX_API
#define U_IO_API
#define U_TOOLUTIL_API
#elif defined(U_COMMON_IMPLEMENTATION)
/* U_COMMON_IMPLEMENTATION: only U_COMMON_API is U_EXPORT. */
#define U_DATA_API U_IMPORT
#define U_COMMON_API U_EXPORT
#define U_I18N_API U_IMPORT
#define U_LAYOUT_API U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_I18N_IMPLEMENTATION)
/* U_I18N_IMPLEMENTATION: only U_I18N_API is U_EXPORT. */
#define U_DATA_API U_IMPORT
#define U_COMMON_API U_IMPORT
#define U_I18N_API U_EXPORT
#define U_LAYOUT_API U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_LAYOUT_IMPLEMENTATION)
/* U_LAYOUT_IMPLEMENTATION: only U_LAYOUT_API is U_EXPORT. */
#define U_DATA_API U_IMPORT
#define U_COMMON_API U_IMPORT
#define U_I18N_API U_IMPORT
#define U_LAYOUT_API U_EXPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_LAYOUTEX_IMPLEMENTATION)
/* U_LAYOUTEX_IMPLEMENTATION: only U_LAYOUTEX_API is U_EXPORT. */
#define U_DATA_API U_IMPORT
#define U_COMMON_API U_IMPORT
#define U_I18N_API U_IMPORT
#define U_LAYOUT_API U_IMPORT
#define U_LAYOUTEX_API U_EXPORT
#define U_IO_API U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_IO_IMPLEMENTATION)
/* U_IO_IMPLEMENTATION: only U_IO_API is U_EXPORT. */
#define U_DATA_API U_IMPORT
#define U_COMMON_API U_IMPORT
#define U_I18N_API U_IMPORT
#define U_LAYOUT_API U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API U_EXPORT
#define U_TOOLUTIL_API U_IMPORT
#elif defined(U_TOOLUTIL_IMPLEMENTATION)
/* U_TOOLUTIL_IMPLEMENTATION: only U_TOOLUTIL_API is U_EXPORT. */
#define U_DATA_API U_IMPORT
#define U_COMMON_API U_IMPORT
#define U_I18N_API U_IMPORT
#define U_LAYOUT_API U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API U_IMPORT
#define U_TOOLUTIL_API U_EXPORT
#else
/* None of the U_xxx_IMPLEMENTATION macros is defined: every API macro is U_IMPORT. */
#define U_DATA_API U_IMPORT
#define U_COMMON_API U_IMPORT
#define U_I18N_API U_IMPORT
#define U_LAYOUT_API U_IMPORT
#define U_LAYOUTEX_API U_IMPORT
#define U_IO_API U_IMPORT
#define U_TOOLUTIL_API U_IMPORT
#endif
/**
* \def U_STANDARD_CPP_NAMESPACE
* Control of C++ Namespace: expands to the global scope qualifier "::"
* when compiled as C++, and to nothing when compiled as C.
* @stable ICU 2.0
*/
#ifndef __cplusplus
#   define U_STANDARD_CPP_NAMESPACE
#else
#   define U_STANDARD_CPP_NAMESPACE ::
#endif
/*===========================================================================*/
/* UErrorCode */
/*===========================================================================*/
/**
* Standard ICU4C error code type, a substitute for exceptions.
*
* Initialize the UErrorCode with U_ZERO_ERROR, and check for success or
* failure using U_SUCCESS() or U_FAILURE():
*
* UErrorCode errorCode = U_ZERO_ERROR;
* // call ICU API that needs an error code parameter.
* if (U_FAILURE(errorCode)) {
* // An error occurred. Handle it here.
* }
*
* C++ code should use icu::ErrorCode, available in unicode/errorcode.h, or a
* suitable subclass.
*
* For more information, see:
* https://unicode-org.github.io/icu/userguide/dev/codingguidelines#details-about-icu-error-codes
*
* Note: By convention, ICU functions that take a reference (C++) or a pointer
* (C) to a UErrorCode first test:
*
* if (U_FAILURE(errorCode)) { return immediately; }
*
* so that in a chain of such functions the first one that sets an error code
* causes the following ones to not perform any operations.
*
* @stable ICU 2.0
*/
typedef enum UErrorCode {
/* The ordering of U_ERROR_INFO_START Vs U_USING_FALLBACK_WARNING looks weird
* and is that way because VC++ debugger displays first encountered constant,
* which is not the what the code is used for
*/
U_USING_FALLBACK_WARNING = -128, /**< A resource bundle lookup returned a fallback result (not an error) */
U_ERROR_WARNING_START = -128, /**< Start of information results (semantically successful) */
U_USING_DEFAULT_WARNING = -127, /**< A resource bundle lookup returned a result from the root locale (not an error) */
U_SAFECLONE_ALLOCATED_WARNING = -126, /**< A SafeClone operation required allocating memory (informational only) */
U_STATE_OLD_WARNING = -125, /**< ICU has to use compatibility layer to construct the service. Expect performance/memory usage degradation. Consider upgrading */
U_STRING_NOT_TERMINATED_WARNING = -124,/**< An output string could not be NUL-terminated because output length==destCapacity. */
U_SORT_KEY_TOO_SHORT_WARNING = -123, /**< Number of levels requested in getBound is higher than the number of levels in the sort key */
U_AMBIGUOUS_ALIAS_WARNING = -122, /**< This converter alias can go to different converter implementations */
U_DIFFERENT_UCA_VERSION = -121, /**< ucol_open encountered a mismatch between UCA version and collator image version, so the collator was constructed from rules. No impact to further function */
U_PLUGIN_CHANGED_LEVEL_WARNING = -120, /**< A plugin caused a level change. May not be an error, but later plugins may not load. */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal UErrorCode warning value.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_ERROR_WARNING_LIMIT,
#endif // U_HIDE_DEPRECATED_API
U_ZERO_ERROR = 0, /**< No error, no warning. */
U_ILLEGAL_ARGUMENT_ERROR = 1, /**< Start of codes indicating failure */
U_MISSING_RESOURCE_ERROR = 2, /**< The requested resource cannot be found */
U_INVALID_FORMAT_ERROR = 3, /**< Data format is not what is expected */
U_FILE_ACCESS_ERROR = 4, /**< The requested file cannot be found */
U_INTERNAL_PROGRAM_ERROR = 5, /**< Indicates a bug in the library code */
U_MESSAGE_PARSE_ERROR = 6, /**< Unable to parse a message (message format) */
U_MEMORY_ALLOCATION_ERROR = 7, /**< Memory allocation error */
U_INDEX_OUTOFBOUNDS_ERROR = 8, /**< Trying to access the index that is out of bounds */
U_PARSE_ERROR = 9, /**< Equivalent to Java ParseException */
U_INVALID_CHAR_FOUND = 10, /**< Character conversion: Unmappable input sequence. In other APIs: Invalid character. */
U_TRUNCATED_CHAR_FOUND = 11, /**< Character conversion: Incomplete input sequence. */
U_ILLEGAL_CHAR_FOUND = 12, /**< Character conversion: Illegal input sequence/combination of input units. */
U_INVALID_TABLE_FORMAT = 13, /**< Conversion table file found, but corrupted */
U_INVALID_TABLE_FILE = 14, /**< Conversion table file not found */
U_BUFFER_OVERFLOW_ERROR = 15, /**< A result would not fit in the supplied buffer */
U_UNSUPPORTED_ERROR = 16, /**< Requested operation not supported in current context */
U_RESOURCE_TYPE_MISMATCH = 17, /**< an operation is requested over a resource that does not support it */
U_ILLEGAL_ESCAPE_SEQUENCE = 18, /**< ISO-2022 illegal escape sequence */
U_UNSUPPORTED_ESCAPE_SEQUENCE = 19, /**< ISO-2022 unsupported escape sequence */
U_NO_SPACE_AVAILABLE = 20, /**< No space available for in-buffer expansion for Arabic shaping */
U_CE_NOT_FOUND_ERROR = 21, /**< Currently used only while setting variable top, but can be used generally */
U_PRIMARY_TOO_LONG_ERROR = 22, /**< User tried to set variable top to a primary that is longer than two bytes */
U_STATE_TOO_OLD_ERROR = 23, /**< ICU cannot construct a service from this state, as it is no longer supported */
U_TOO_MANY_ALIASES_ERROR = 24, /**< There are too many aliases in the path to the requested resource.
It is very possible that a circular alias definition has occurred */
U_ENUM_OUT_OF_SYNC_ERROR = 25, /**< UEnumeration out of sync with underlying collection */
U_INVARIANT_CONVERSION_ERROR = 26, /**< Unable to convert a UChar* string to char* with the invariant converter. */
U_INVALID_STATE_ERROR = 27, /**< Requested operation can not be completed with ICU in its current state */
U_COLLATOR_VERSION_MISMATCH = 28, /**< Collator version is not compatible with the base version */
U_USELESS_COLLATOR_ERROR = 29, /**< Collator is options only and no base is specified */
U_NO_WRITE_PERMISSION = 30, /**< Attempt to modify read-only or constant data. */
/**
* The input is impractically long for an operation.
* It is rejected because it may lead to problems such as excessive
* processing time, stack depth, or heap memory requirements.
*
* @stable ICU 68
*/
U_INPUT_TOO_LONG_ERROR = 31,
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest standard error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_STANDARD_ERROR_LIMIT = 32,
#endif // U_HIDE_DEPRECATED_API
/*
* Error codes in the range 0x10000 0x10100 are reserved for Transliterator.
*/
U_BAD_VARIABLE_DEFINITION=0x10000,/**< Missing '$' or duplicate variable name */
U_PARSE_ERROR_START = 0x10000, /**< Start of Transliterator errors */
U_MALFORMED_RULE, /**< Elements of a rule are misplaced */
U_MALFORMED_SET, /**< A UnicodeSet pattern is invalid*/
U_MALFORMED_SYMBOL_REFERENCE, /**< UNUSED as of ICU 2.4 */
U_MALFORMED_UNICODE_ESCAPE, /**< A Unicode escape pattern is invalid*/
U_MALFORMED_VARIABLE_DEFINITION, /**< A variable definition is invalid */
U_MALFORMED_VARIABLE_REFERENCE, /**< A variable reference is invalid */
U_MISMATCHED_SEGMENT_DELIMITERS, /**< UNUSED as of ICU 2.4 */
U_MISPLACED_ANCHOR_START, /**< A start anchor appears at an illegal position */
U_MISPLACED_CURSOR_OFFSET, /**< A cursor offset occurs at an illegal position */
U_MISPLACED_QUANTIFIER, /**< A quantifier appears after a segment close delimiter */
U_MISSING_OPERATOR, /**< A rule contains no operator */
U_MISSING_SEGMENT_CLOSE, /**< UNUSED as of ICU 2.4 */
U_MULTIPLE_ANTE_CONTEXTS, /**< More than one ante context */
U_MULTIPLE_CURSORS, /**< More than one cursor */
U_MULTIPLE_POST_CONTEXTS, /**< More than one post context */
U_TRAILING_BACKSLASH, /**< A dangling backslash */
U_UNDEFINED_SEGMENT_REFERENCE, /**< A segment reference does not correspond to a defined segment */
U_UNDEFINED_VARIABLE, /**< A variable reference does not correspond to a defined variable */
U_UNQUOTED_SPECIAL, /**< A special character was not quoted or escaped */
U_UNTERMINATED_QUOTE, /**< A closing single quote is missing */
U_RULE_MASK_ERROR, /**< A rule is hidden by an earlier more general rule */
U_MISPLACED_COMPOUND_FILTER, /**< A compound filter is in an invalid location */
U_MULTIPLE_COMPOUND_FILTERS, /**< More than one compound filter */
U_INVALID_RBT_SYNTAX, /**< A "::id" rule was passed to the RuleBasedTransliterator parser */
U_INVALID_PROPERTY_PATTERN, /**< UNUSED as of ICU 2.4 */
U_MALFORMED_PRAGMA, /**< A 'use' pragma is invalid */
U_UNCLOSED_SEGMENT, /**< A closing ')' is missing */
U_ILLEGAL_CHAR_IN_SEGMENT, /**< UNUSED as of ICU 2.4 */
U_VARIABLE_RANGE_EXHAUSTED, /**< Too many stand-ins generated for the given variable range */
U_VARIABLE_RANGE_OVERLAP, /**< The variable range overlaps characters used in rules */
U_ILLEGAL_CHARACTER, /**< A special character is outside its allowed context */
U_INTERNAL_TRANSLITERATOR_ERROR, /**< Internal transliterator system error */
U_INVALID_ID, /**< A "::id" rule specifies an unknown transliterator */
U_INVALID_FUNCTION, /**< A "&fn()" rule specifies an unknown transliterator */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal Transliterator error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_PARSE_ERROR_LIMIT,
#endif // U_HIDE_DEPRECATED_API
/*
* Error codes in the range 0x10100 0x10200 are reserved for the formatting API.
*/
U_UNEXPECTED_TOKEN=0x10100, /**< Syntax error in format pattern */
U_FMT_PARSE_ERROR_START=0x10100, /**< Start of format library errors */
U_MULTIPLE_DECIMAL_SEPARATORS, /**< More than one decimal separator in number pattern */
U_MULTIPLE_DECIMAL_SEPERATORS = U_MULTIPLE_DECIMAL_SEPARATORS, /**< Typo: kept for backward compatibility. Use U_MULTIPLE_DECIMAL_SEPARATORS */
U_MULTIPLE_EXPONENTIAL_SYMBOLS, /**< More than one exponent symbol in number pattern */
U_MALFORMED_EXPONENTIAL_PATTERN, /**< Grouping symbol in exponent pattern */
U_MULTIPLE_PERCENT_SYMBOLS, /**< More than one percent symbol in number pattern */
U_MULTIPLE_PERMILL_SYMBOLS, /**< More than one permill symbol in number pattern */
U_MULTIPLE_PAD_SPECIFIERS, /**< More than one pad symbol in number pattern */
U_PATTERN_SYNTAX_ERROR, /**< Syntax error in format pattern */
U_ILLEGAL_PAD_POSITION, /**< Pad symbol misplaced in number pattern */
U_UNMATCHED_BRACES, /**< Braces do not match in message pattern */
U_UNSUPPORTED_PROPERTY, /**< UNUSED as of ICU 2.4 */
U_UNSUPPORTED_ATTRIBUTE, /**< UNUSED as of ICU 2.4 */
U_ARGUMENT_TYPE_MISMATCH, /**< Argument name and argument index mismatch in MessageFormat functions */
U_DUPLICATE_KEYWORD, /**< Duplicate keyword in PluralFormat */
U_UNDEFINED_KEYWORD, /**< Undefined Plural keyword */
U_DEFAULT_KEYWORD_MISSING, /**< Missing DEFAULT rule in plural rules */
U_DECIMAL_NUMBER_SYNTAX_ERROR, /**< Decimal number syntax error */
U_FORMAT_INEXACT_ERROR, /**< Cannot format a number exactly and rounding mode is ROUND_UNNECESSARY @stable ICU 4.8 */
U_NUMBER_ARG_OUTOFBOUNDS_ERROR, /**< The argument to a NumberFormatter helper method was out of bounds; the bounds are usually 0 to 999. @stable ICU 61 */
U_NUMBER_SKELETON_SYNTAX_ERROR, /**< The number skeleton passed to C++ NumberFormatter or C UNumberFormatter was invalid or contained a syntax error. @stable ICU 62 */
/* MessageFormat 2.0 errors */
U_MF_UNRESOLVED_VARIABLE_ERROR, /**< A variable is referred to but not bound by any definition @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_SYNTAX_ERROR, /**< Includes all syntax errors @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_UNKNOWN_FUNCTION_ERROR, /**< An annotation refers to a function not defined by the standard or custom function registry @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_VARIANT_KEY_MISMATCH_ERROR, /**< In a match-construct, one or more variants had a different number of keys from the number of selectors @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_FORMATTING_ERROR, /**< Covers all runtime errors: for example, an internally inconsistent set of options. @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_NONEXHAUSTIVE_PATTERN_ERROR, /**< In a match-construct, the variants do not cover all possible values @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_DUPLICATE_OPTION_NAME_ERROR, /**< In an annotation, the same option name appears more than once @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_SELECTOR_ERROR, /**< A selector function is applied to an operand of the wrong type @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_MISSING_SELECTOR_ANNOTATION_ERROR, /**< A selector expression evaluates to an unannotated operand. @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_DUPLICATE_DECLARATION_ERROR, /**< The same variable is declared in more than one .local or .input declaration. @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_OPERAND_MISMATCH_ERROR, /**< An operand provided to a function does not have the required form for that function @internal ICU 75 technology preview @deprecated This API is for technology preview only. */
U_MF_DUPLICATE_VARIANT_ERROR, /**< A message includes a variant with the same key list as another variant. @internal ICU 76 technology preview @deprecated This API is for technology preview only. */
U_MF_BAD_OPTION, /**< An option value provided to a function does not have the required form for that option. @internal ICU 77 technology preview @deprecated This API is for technology preview only. */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal formatting API error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_FMT_PARSE_ERROR_LIMIT = 0x10121,
#endif // U_HIDE_DEPRECATED_API
/*
* Error codes in the range 0x10200-0x102ff are reserved for BreakIterator.
*/
U_BRK_INTERNAL_ERROR=0x10200, /**< An internal error (bug) was detected. */
U_BRK_ERROR_START=0x10200, /**< Start of codes indicating Break Iterator failures */
U_BRK_HEX_DIGITS_EXPECTED, /**< Hex digits expected as part of a escaped char in a rule. */
U_BRK_SEMICOLON_EXPECTED, /**< Missing ';' at the end of a RBBI rule. */
U_BRK_RULE_SYNTAX, /**< Syntax error in RBBI rule. */
U_BRK_UNCLOSED_SET, /**< UnicodeSet writing an RBBI rule missing a closing ']'. */
U_BRK_ASSIGN_ERROR, /**< Syntax error in RBBI rule assignment statement. */
U_BRK_VARIABLE_REDFINITION, /**< RBBI rule $Variable redefined. */
U_BRK_MISMATCHED_PAREN, /**< Mis-matched parentheses in an RBBI rule. */
U_BRK_NEW_LINE_IN_QUOTED_STRING, /**< Missing closing quote in an RBBI rule. */
U_BRK_UNDEFINED_VARIABLE, /**< Use of an undefined $Variable in an RBBI rule. */
U_BRK_INIT_ERROR, /**< Initialization failure. Probable missing ICU Data. */
U_BRK_RULE_EMPTY_SET, /**< Rule contains an empty Unicode Set. */
U_BRK_UNRECOGNIZED_OPTION, /**< !!option in RBBI rules not recognized. */
U_BRK_MALFORMED_RULE_TAG, /**< The {nnn} tag on a rule is malformed */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal BreakIterator error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_BRK_ERROR_LIMIT,
#endif // U_HIDE_DEPRECATED_API
/*
* Error codes in the range 0x10300-0x103ff are reserved for regular expression related errors.
*/
U_REGEX_INTERNAL_ERROR=0x10300, /**< An internal error (bug) was detected. */
U_REGEX_ERROR_START=0x10300, /**< Start of codes indicating Regexp failures */
U_REGEX_RULE_SYNTAX, /**< Syntax error in regexp pattern. */
U_REGEX_INVALID_STATE, /**< RegexMatcher in invalid state for requested operation */
U_REGEX_BAD_ESCAPE_SEQUENCE, /**< Unrecognized backslash escape sequence in pattern */
U_REGEX_PROPERTY_SYNTAX, /**< Incorrect Unicode property */
U_REGEX_UNIMPLEMENTED, /**< Use of regexp feature that is not yet implemented. */
U_REGEX_MISMATCHED_PAREN, /**< Incorrectly nested parentheses in regexp pattern. */
U_REGEX_NUMBER_TOO_BIG, /**< Decimal number is too large. */
U_REGEX_BAD_INTERVAL, /**< Error in {min,max} interval */
U_REGEX_MAX_LT_MIN, /**< In {min,max}, max is less than min. */
U_REGEX_INVALID_BACK_REF, /**< Back-reference to a non-existent capture group. */
U_REGEX_INVALID_FLAG, /**< Invalid value for match mode flags. */
U_REGEX_LOOK_BEHIND_LIMIT, /**< Look-Behind pattern matches must have a bounded maximum length. */
U_REGEX_SET_CONTAINS_STRING, /**< Regexps cannot have UnicodeSets containing strings.*/
#ifndef U_HIDE_DEPRECATED_API
U_REGEX_OCTAL_TOO_BIG, /**< Octal character constants must be <= 0377. @deprecated ICU 54. This error cannot occur. */
#endif /* U_HIDE_DEPRECATED_API */
U_REGEX_MISSING_CLOSE_BRACKET=U_REGEX_SET_CONTAINS_STRING+2, /**< Missing closing bracket on a bracket expression. */
U_REGEX_INVALID_RANGE, /**< In a character range [x-y], x is greater than y. */
U_REGEX_STACK_OVERFLOW, /**< Regular expression backtrack stack overflow. */
U_REGEX_TIME_OUT, /**< Maximum allowed match time exceeded */
U_REGEX_STOPPED_BY_CALLER, /**< Matching operation aborted by user callback fn. */
U_REGEX_PATTERN_TOO_BIG, /**< Pattern exceeds limits on size or complexity. @stable ICU 55 */
U_REGEX_INVALID_CAPTURE_GROUP_NAME, /**< Invalid capture group name. @stable ICU 55 */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal regular expression error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_REGEX_ERROR_LIMIT=U_REGEX_STOPPED_BY_CALLER+3,
#endif // U_HIDE_DEPRECATED_API
/*
* Error codes in the range 0x10400-0x104ff are reserved for IDNA related error codes.
*/
U_IDNA_PROHIBITED_ERROR=0x10400,
U_IDNA_ERROR_START=0x10400,
U_IDNA_UNASSIGNED_ERROR,
U_IDNA_CHECK_BIDI_ERROR,
U_IDNA_STD3_ASCII_RULES_ERROR,
U_IDNA_ACE_PREFIX_ERROR,
U_IDNA_VERIFICATION_ERROR,
U_IDNA_LABEL_TOO_LONG_ERROR,
U_IDNA_ZERO_LENGTH_LABEL_ERROR,
U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR,
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal IDNA error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_IDNA_ERROR_LIMIT,
#endif // U_HIDE_DEPRECATED_API
/*
* Aliases for StringPrep
*/
U_STRINGPREP_PROHIBITED_ERROR = U_IDNA_PROHIBITED_ERROR,
U_STRINGPREP_UNASSIGNED_ERROR = U_IDNA_UNASSIGNED_ERROR,
U_STRINGPREP_CHECK_BIDI_ERROR = U_IDNA_CHECK_BIDI_ERROR,
/*
* Error codes in the range 0x10500-0x105ff are reserved for Plugin related error codes.
*/
U_PLUGIN_ERROR_START=0x10500, /**< Start of codes indicating plugin failures */
U_PLUGIN_TOO_HIGH=0x10500, /**< The plugin's level is too high to be loaded right now. */
U_PLUGIN_DIDNT_SET_LEVEL, /**< The plugin didn't call uplug_setPlugLevel in response to a QUERY */
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal plug-in error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_PLUGIN_ERROR_LIMIT,
#endif // U_HIDE_DEPRECATED_API
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal error code.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
U_ERROR_LIMIT=U_PLUGIN_ERROR_LIMIT
#endif // U_HIDE_DEPRECATED_API
} UErrorCode;
/* Use the following to determine if an UErrorCode represents */
/* operational success or failure. */
#ifdef __cplusplus
/**
 * Tests whether an error code denotes success.
 * Every value that is not greater than U_ZERO_ERROR counts as success.
 * @param code the UErrorCode to test
 * @return true if code <= U_ZERO_ERROR, false otherwise
 * @stable ICU 2.0
 */
static inline UBool U_SUCCESS(UErrorCode code) {
    /* Success is the absence of failure: nothing above U_ZERO_ERROR. */
    return !(code > U_ZERO_ERROR);
}
/**
 * Tests whether an error code denotes a failure.
 * Only values strictly greater than U_ZERO_ERROR count as failures.
 * @param code the UErrorCode to test
 * @return true if code > U_ZERO_ERROR, false otherwise
 * @stable ICU 2.0
 */
static inline UBool U_FAILURE(UErrorCode code) {
    return U_ZERO_ERROR < code;
}
#else
/**
 * Does the error code indicate success?
 * Macro form, used when __cplusplus is not defined.
 * Expands to a comparison that is true when x is less than or equal to
 * U_ZERO_ERROR. The argument is parenthesized and appears once in the expansion.
 * @param x the error code to test
 * @stable ICU 2.0
 */
# define U_SUCCESS(x) ((x)<=U_ZERO_ERROR)
/**
 * Does the error code indicate a failure?
 * Macro form, used when __cplusplus is not defined.
 * Expands to a comparison that is true when x is strictly greater than
 * U_ZERO_ERROR. The argument is parenthesized and appears once in the expansion.
 * @param x the error code to test
 * @stable ICU 2.0
 */
# define U_FAILURE(x) ((x)>U_ZERO_ERROR)
#endif
/**
 * Return a string for a UErrorCode value.
 * The string will be the same as the name of the error code constant
 * in the UErrorCode enum above.
 * @param code the UErrorCode value whose name is requested
 * @return the name of the error code constant, as a C string
 * @stable ICU 2.0
 */
U_CAPI const char * U_EXPORT2
u_errorName(UErrorCode code);
#endif /* _UTYPES */

View File

@@ -0,0 +1,191 @@
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2000-2016, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*
* file name: uvernum.h
* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
* Created by: Vladimir Weinstein
* Updated by: Steven R. Loomis
*
*/
/**
* \file
* \brief C API: definitions of ICU version numbers
*
* This file is included by uversion.h and other files. This file contains only
* macros and definitions. The actual version numbers are defined here.
*/
/*
* IMPORTANT: When updating version, the following things need to be done:
* source/common/unicode/uvernum.h - this file: update major, minor,
* patchlevel, suffix, version, short version constants, namespace,
* renaming macro, and copyright
*
* The following files need to be updated as well, which can be done
* by running the UNIX makefile target 'update-windows-makefiles' in icu4c/source.
*
* source/allinone/Build.Windows.IcuVersion.props - Update the IcuMajorVersion
* source/data/makedata.mak - change U_ICUDATA_NAME so that it contains
* the new major/minor combination, and UNICODE_VERSION
* for the Unicode version.
*/
#ifndef UVERNUM_H
#define UVERNUM_H
/**
 * The standard copyright notice that gets compiled into each library.
 * The macro expands to the string literal on the continuation line that
 * follows the backslash; the literal has one leading and one trailing space.
 * This value will change in subsequent releases of ICU.
 * @stable ICU 2.4
 */
#define U_COPYRIGHT_STRING \
" Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html "
/**
 * The current ICU major version as an integer.
 * Must be updated together with U_ICU_VERSION_SUFFIX, U_ICU_VERSION and
 * U_ICU_VERSION_SHORT, which repeat the same major number.
 * This value will change in subsequent releases of ICU.
 * @stable ICU 2.4
 */
#define U_ICU_VERSION_MAJOR_NUM 77
/**
 * The current ICU minor version as an integer.
 * Must be updated together with the U_ICU_VERSION string, which repeats it
 * as the second dotted component.
 * This value will change in subsequent releases of ICU.
 * @stable ICU 2.6
 */
#define U_ICU_VERSION_MINOR_NUM 1
/**
 * The current ICU patchlevel version as an integer.
 * The patchlevel is included in the U_ICU_VERSION string only when it is
 * non-zero.
 * This value will change in subsequent releases of ICU.
 * @stable ICU 2.4
 */
#define U_ICU_VERSION_PATCHLEVEL_NUM 0
/**
 * The current ICU build level version as an integer.
 * This value is for use by ICU clients. It defaults to 0.
 * The #ifndef guard keeps any definition made before this header is
 * included, so the default applies only when no such definition exists.
 * @stable ICU 4.0
 */
#ifndef U_ICU_VERSION_BUILDLEVEL_NUM
#define U_ICU_VERSION_BUILDLEVEL_NUM 0
#endif
/**
 * Glued version suffix for renamers.
 * This is a bare preprocessor token (not a string literal): it is pasted onto
 * entry point names by U_ICU_ENTRY_POINT_RENAME further down in this file.
 * It repeats the major version number and must be updated together with
 * U_ICU_VERSION_MAJOR_NUM.
 * This value will change in subsequent releases of ICU.
 * @stable ICU 2.6
 */
#define U_ICU_VERSION_SUFFIX _77
/**
* \def U_DEF2_ICU_ENTRY_POINT_RENAME
* @internal
*/
/**
* \def U_DEF_ICU_ENTRY_POINT_RENAME
* @internal
*/
/** Glued version suffix function for renamers
* This value will change in the subsequent releases of ICU.
* If a custom suffix (such as matching library suffixes) is desired, this can be modified.
* Note that if present, platform.h may contain an earlier definition of this macro.
* \def U_ICU_ENTRY_POINT_RENAME
* @stable ICU 4.2
*/
/**
 * Disable the version suffix. Use the custom suffix if it exists.
 * Defaults to 0 unless already defined before this header is included.
 * When non-zero, U_ICU_ENTRY_POINT_RENAME (defined further down in this file)
 * leaves U_ICU_VERSION_SUFFIX out of the renamed symbol.
 * \def U_DISABLE_VERSION_SUFFIX
 * @internal
 */
#ifndef U_DISABLE_VERSION_SUFFIX
#define U_DISABLE_VERSION_SUFFIX 0
#endif
/*
 * Define U_ICU_ENTRY_POINT_RENAME(x) unless an earlier definition exists.
 *
 * The renamed symbol is produced by token pasting (##). Each paste goes
 * through two macro levels (U_DEF2_ICU_ENTRY_POINT_RENAME forwarding to
 * U_DEF_ICU_ENTRY_POINT_RENAME) so that the suffix macros passed as arguments
 * are expanded before they are pasted.
 *
 * Result by configuration:
 *   library suffix, version suffix enabled  -> x + U_ICU_VERSION_SUFFIX + U_LIB_SUFFIX_C_NAME
 *   library suffix, version suffix disabled -> x + U_LIB_SUFFIX_C_NAME
 *   no library suffix, version enabled      -> x + U_ICU_VERSION_SUFFIX
 *   no library suffix, version disabled     -> x (unchanged)
 */
#ifndef U_ICU_ENTRY_POINT_RENAME
/* NOTE(review): #ifdef tests only whether U_HAVE_LIB_SUFFIX is defined, so a
 * definition with the value 0 would also select this branch - confirm intended. */
#ifdef U_HAVE_LIB_SUFFIX
# if !U_DISABLE_VERSION_SUFFIX
/* Three-way paste: name, version suffix, library suffix. */
# define U_DEF_ICU_ENTRY_POINT_RENAME(x,y,z) x ## y ## z
# define U_DEF2_ICU_ENTRY_POINT_RENAME(x,y,z) U_DEF_ICU_ENTRY_POINT_RENAME(x,y,z)
# define U_ICU_ENTRY_POINT_RENAME(x) U_DEF2_ICU_ENTRY_POINT_RENAME(x,U_ICU_VERSION_SUFFIX,U_LIB_SUFFIX_C_NAME)
# else
/* Version suffix disabled: paste only the library suffix. */
# define U_DEF_ICU_ENTRY_POINT_RENAME(x,y) x ## y
# define U_DEF2_ICU_ENTRY_POINT_RENAME(x,y) U_DEF_ICU_ENTRY_POINT_RENAME(x,y)
# define U_ICU_ENTRY_POINT_RENAME(x) U_DEF2_ICU_ENTRY_POINT_RENAME(x,U_LIB_SUFFIX_C_NAME)
# endif
#else
# if !U_DISABLE_VERSION_SUFFIX
/* No library suffix: paste only the version suffix. */
# define U_DEF_ICU_ENTRY_POINT_RENAME(x,y) x ## y
# define U_DEF2_ICU_ENTRY_POINT_RENAME(x,y) U_DEF_ICU_ENTRY_POINT_RENAME(x,y)
# define U_ICU_ENTRY_POINT_RENAME(x) U_DEF2_ICU_ENTRY_POINT_RENAME(x,U_ICU_VERSION_SUFFIX)
# else
/* Neither suffix applies: the entry point keeps its plain name. */
# define U_ICU_ENTRY_POINT_RENAME(x) x
# endif
#endif
#endif
/**
 * The current ICU library version as a dotted-decimal string. The patchlevel
 * only appears in this string if it is non-zero.
 * Must be updated together with the numeric U_ICU_VERSION_*_NUM constants
 * defined earlier in this file.
 * This value will change in subsequent releases of ICU.
 * @stable ICU 2.4
 */
#define U_ICU_VERSION "77.1"
/**
 * The current ICU library major version number as a string, for library name suffixes.
 * This value will change in subsequent releases of ICU.
 *
 * Until ICU 4.8, this was the combination of the single-digit major and minor ICU version numbers
 * into one string without dots ("48").
 * Since ICU 49, it is the double-digit major ICU version number.
 * See https://unicode-org.github.io/icu/userguide/design#version-numbers-in-icu
 *
 * Must be updated together with U_ICU_VERSION_MAJOR_NUM, whose value it repeats as text.
 *
 * @stable ICU 2.6
 */
#define U_ICU_VERSION_SHORT "77"
#ifndef U_HIDE_INTERNAL_API
/**
 * Data version in ICU4C, as a dotted-decimal string.
 * Only defined when U_HIDE_INTERNAL_API is not defined.
 * @internal ICU 4.4 Internal Use Only
 */
#define U_ICU_DATA_VERSION "77.1"
#endif /* U_HIDE_INTERNAL_API */
/*===========================================================================
* ICU collation framework version information
* Version info that can be obtained from a collator is affected by these
* numbers in a secret and magic way. Please use the collator version as a whole.
*===========================================================================
*/
/**
 * Collation runtime version (sort key generator, strcoll).
 * If this version differs, sort keys for the same string could be different.
 * This value may change in subsequent releases of ICU.
 * @stable ICU 2.4
 */
#define UCOL_RUNTIME_VERSION 9
/**
 * Collation builder code version.
 * When this version differs, the same tailoring might result
 * in different collation elements being assigned to code points.
 * This value may change in subsequent releases of ICU.
 * @stable ICU 2.4
 */
#define UCOL_BUILDER_VERSION 9
#ifndef U_HIDE_DEPRECATED_API
/**
 * Constant 1.
 * This was intended to be the version of collation tailorings,
 * but instead the tailoring data carries a version number.
 * Only defined when U_HIDE_DEPRECATED_API is not defined.
 * @deprecated ICU 54
 */
#define UCOL_TAILORINGS_VERSION 1
#endif /* U_HIDE_DEPRECATED_API */
#endif

Some files were not shown because too many files have changed in this diff Show More