ICU 69.1  69.1
uniset.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ***************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 ***************************************************************************
11 */
12 
13 #ifndef UNICODESET_H
14 #define UNICODESET_H
15 
16 #include "unicode/utypes.h"
17 
18 #if U_SHOW_CPLUSPLUS_API
19 
20 #include "unicode/ucpmap.h"
21 #include "unicode/unifilt.h"
22 #include "unicode/unistr.h"
23 #include "unicode/uset.h"
24 
30 U_NAMESPACE_BEGIN
31 
32 // Forward Declarations.
33 class BMPSet;
34 class ParsePosition;
35 class RBBIRuleScanner;
36 class SymbolTable;
37 class UnicodeSetStringSpan;
38 class UVector;
39 class RuleCharacterIterator;
40 
280 private:
// Private state of the set. Declaration order is significant here (layout and
// default member initializers), so only comments are added.

// Element count of the inline stackList array below; also the starting value
// of `capacity`.
285  static constexpr int32_t INITIAL_CAPACITY = 25;
286  // fFlags constant
287  static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
288 
// `list` starts out pointing at the inline stackList array declared at the end
// of this group, with capacity INITIAL_CAPACITY and a used length of 1.
289  UChar32* list = stackList; // MUST be terminated with HIGH
290  int32_t capacity = INITIAL_CAPACITY; // capacity of list
291  int32_t len = 1; // length of list used; 1 <= len <= capacity
292  uint8_t fFlags = 0; // Bit flag (see constants above)
293 
294  BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.
295  UChar32* buffer = nullptr; // internal buffer, may be NULL
296  int32_t bufferCapacity = 0; // capacity of buffer
297 
// Pattern text and its length in char16_t units; both start out empty.
// NOTE(review): presumably written by setPattern() and cleared by
// releasePattern(), which are declared later in this class - confirm in the
// implementation file.
307  char16_t *pat = nullptr;
308  int32_t patLen = 0;
309 
310  UVector* strings = nullptr; // maintained in sorted order
// A non-null stringSpan marks the set as frozen, same as a non-null bmpSet
// (isFrozen() tests both pointers).
311  UnicodeSetStringSpan *stringSpan = nullptr;
312 
// Inline storage of INITIAL_CAPACITY code points; `list` points here by default.
318  UChar32 stackList[INITIAL_CAPACITY];
319 
320 public:
330  inline UBool isBogus(void) const;
331 
348  void setToBogus();
349 
350 public:
351 
// Public numeric bounds exposed by the class.
// NOTE(review): presumably the smallest and largest code point a set can hold;
// the values match the U+0000..U+10FFFF range quoted in this listing's tooltips.
352  enum {
// Lower bound: 0 (U+0000).
357  MIN_VALUE = 0,
358 
// Upper bound: 0x10ffff (U+10FFFF).
363  MAX_VALUE = 0x10ffff
364  };
365 
366  //----------------------------------------------------------------
367  // Constructors &c
368  //----------------------------------------------------------------
369 
370 public:
371 
376  UnicodeSet();
377 
386  UnicodeSet(UChar32 start, UChar32 end);
387 
388 #ifndef U_HIDE_INTERNAL_API
389 
/**
 * Tag type naming the kind of data handed to the buffer-based constructor
 * declared right after this enum (it takes an ESerialization argument).
 * Internal API: this declaration sits inside the U_HIDE_INTERNAL_API guard.
 *
 * The listing had lost the `enum ESerialization {` opener, leaving a dangling
 * enumerator and closing brace; it is restored here so that the type name
 * used by that constructor is actually declared.
 */
enum ESerialization {
    kSerialized /* result of serialize() */
};
395 
406  UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
407  ESerialization serialization, UErrorCode &status);
408 #endif /* U_HIDE_INTERNAL_API */
409 
418  UnicodeSet(const UnicodeString& pattern,
419  UErrorCode& status);
420 
421 #ifndef U_HIDE_INTERNAL_API
422 
434  UnicodeSet(const UnicodeString& pattern,
435  uint32_t options,
436  const SymbolTable* symbols,
437  UErrorCode& status);
438 #endif /* U_HIDE_INTERNAL_API */
439 
453  UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
454  uint32_t options,
455  const SymbolTable* symbols,
456  UErrorCode& status);
457 
462  UnicodeSet(const UnicodeSet& o);
463 
468  virtual ~UnicodeSet();
469 
475  UnicodeSet& operator=(const UnicodeSet& o);
476 
488  virtual UBool operator==(const UnicodeSet& o) const;
489 
495  inline UBool operator!=(const UnicodeSet& o) const;
496 
506  virtual UnicodeSet* clone() const;
507 
515  virtual int32_t hashCode(void) const;
516 
525  inline static UnicodeSet *fromUSet(USet *uset);
526 
535  inline static const UnicodeSet *fromUSet(const USet *uset);
536 
544  inline USet *toUSet();
545 
546 
554  inline const USet * toUSet() const;
555 
556 
557  //----------------------------------------------------------------
558  // Freezable API
559  //----------------------------------------------------------------
560 
569  inline UBool isFrozen() const;
570 
584  UnicodeSet *freeze();
585 
594  UnicodeSet *cloneAsThawed() const;
595 
596  //----------------------------------------------------------------
597  // Public API
598  //----------------------------------------------------------------
599 
609  UnicodeSet& set(UChar32 start, UChar32 end);
610 
616  static UBool resemblesPattern(const UnicodeString& pattern,
617  int32_t pos);
618 
631  UnicodeSet& applyPattern(const UnicodeString& pattern,
632  UErrorCode& status);
633 
634 #ifndef U_HIDE_INTERNAL_API
635 
651  UnicodeSet& applyPattern(const UnicodeString& pattern,
652  uint32_t options,
653  const SymbolTable* symbols,
654  UErrorCode& status);
655 #endif /* U_HIDE_INTERNAL_API */
656 
688  UnicodeSet& applyPattern(const UnicodeString& pattern,
689  ParsePosition& pos,
690  uint32_t options,
691  const SymbolTable* symbols,
692  UErrorCode& status);
693 
707  virtual UnicodeString& toPattern(UnicodeString& result,
708  UBool escapeUnprintable = false) const;
709 
732  UnicodeSet& applyIntPropertyValue(UProperty prop,
733  int32_t value,
734  UErrorCode& ec);
735 
765  UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
766  const UnicodeString& value,
767  UErrorCode& ec);
768 
777  virtual int32_t size(void) const;
778 
785  virtual UBool isEmpty(void) const;
786 
794  virtual UBool contains(UChar32 c) const;
795 
804  virtual UBool contains(UChar32 start, UChar32 end) const;
805 
813  UBool contains(const UnicodeString& s) const;
814 
822  virtual UBool containsAll(const UnicodeSet& c) const;
823 
831  UBool containsAll(const UnicodeString& s) const;
832 
841  UBool containsNone(UChar32 start, UChar32 end) const;
842 
850  UBool containsNone(const UnicodeSet& c) const;
851 
859  UBool containsNone(const UnicodeString& s) const;
860 
869  inline UBool containsSome(UChar32 start, UChar32 end) const;
870 
878  inline UBool containsSome(const UnicodeSet& s) const;
879 
887  inline UBool containsSome(const UnicodeString& s) const;
888 
907  int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
908 
921  inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
922 
940  int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
941 
955  inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
956 
975  int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
976 
994  int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
995 
1000  virtual UMatchDegree matches(const Replaceable& text,
1001  int32_t& offset,
1002  int32_t limit,
1003  UBool incremental);
1004 
1005 private:
1028  static int32_t matchRest(const Replaceable& text,
1029  int32_t start, int32_t limit,
1030  const UnicodeString& s);
1031 
1041  int32_t findCodePoint(UChar32 c) const;
1042 
1043 public:
1044 
1052  virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1053 
1062  int32_t indexOf(UChar32 c) const;
1063 
1073  UChar32 charAt(int32_t index) const;
1074 
1089  virtual UnicodeSet& add(UChar32 start, UChar32 end);
1090 
1101  UnicodeSet& add(UChar32 c);
1102 
1114  UnicodeSet& add(const UnicodeString& s);
1115 
1116  private:
1122  static int32_t getSingleCP(const UnicodeString& s);
1123 
1124  void _add(const UnicodeString& s);
1125 
1126  public:
1135  UnicodeSet& addAll(const UnicodeString& s);
1136 
1144  UnicodeSet& retainAll(const UnicodeString& s);
1145 
1153  UnicodeSet& complementAll(const UnicodeString& s);
1154 
1162  UnicodeSet& removeAll(const UnicodeString& s);
1163 
1172  static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1173 
1174 
1182  static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1183 
1195  virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1196 
1197 
1206  UnicodeSet& retain(UChar32 c);
1207 
1208 #ifndef U_HIDE_DRAFT_API
1209 
1219  UnicodeSet& retain(const UnicodeString &s);
1220 #endif // U_HIDE_DRAFT_API
1221 
1235  virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1236 
1247  UnicodeSet& remove(UChar32 c);
1248 
1258  UnicodeSet& remove(const UnicodeString& s);
1259 
1267  virtual UnicodeSet& complement(void);
1268 
1281  virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1282 
1293  UnicodeSet& complement(UChar32 c);
1294 
1304  UnicodeSet& complement(const UnicodeString& s);
1305 
1318  virtual UnicodeSet& addAll(const UnicodeSet& c);
1319 
1331  virtual UnicodeSet& retainAll(const UnicodeSet& c);
1332 
1344  virtual UnicodeSet& removeAll(const UnicodeSet& c);
1345 
1356  virtual UnicodeSet& complementAll(const UnicodeSet& c);
1357 
1364  virtual UnicodeSet& clear(void);
1365 
1391  UnicodeSet& closeOver(int32_t attribute);
1392 
1399  virtual UnicodeSet &removeAllStrings();
1400 
1408  virtual int32_t getRangeCount(void) const;
1409 
1417  virtual UChar32 getRangeStart(int32_t index) const;
1418 
1426  virtual UChar32 getRangeEnd(int32_t index) const;
1427 
1476  int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1477 
1484  virtual UnicodeSet& compact();
1485 
1497  static UClassID U_EXPORT2 getStaticClassID(void);
1498 
1507  virtual UClassID getDynamicClassID(void) const;
1508 
1509 private:
1510 
1511  // Private API for the USet API
1512 
1513  friend class USetAccess;
1514 
1515  const UnicodeString* getString(int32_t index) const;
1516 
1517  //----------------------------------------------------------------
1518  // RuleBasedTransliterator support
1519  //----------------------------------------------------------------
1520 
1521 private:
1522 
1528  virtual UBool matchesIndexValue(uint8_t v) const;
1529 
1530 private:
1531  friend class RBBIRuleScanner;
1532 
1533  //----------------------------------------------------------------
1534  // Implementation: Clone as thawed (see ICU4J Freezable)
1535  //----------------------------------------------------------------
1536 
1537  UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1538  UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1539 
1540  //----------------------------------------------------------------
1541  // Implementation: Pattern parsing
1542  //----------------------------------------------------------------
1543 
1544  void applyPatternIgnoreSpace(const UnicodeString& pattern,
1545  ParsePosition& pos,
1546  const SymbolTable* symbols,
1547  UErrorCode& status);
1548 
1549  void applyPattern(RuleCharacterIterator& chars,
1550  const SymbolTable* symbols,
1551  UnicodeString& rebuiltPat,
1552  uint32_t options,
1553  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1554  int32_t depth,
1555  UErrorCode& ec);
1556 
1557  //----------------------------------------------------------------
1558  // Implementation: Utility methods
1559  //----------------------------------------------------------------
1560 
1561  static int32_t nextCapacity(int32_t minCapacity);
1562 
1563  bool ensureCapacity(int32_t newLen);
1564 
1565  bool ensureBufferCapacity(int32_t newLen);
1566 
1567  void swapBuffers(void);
1568 
1569  UBool allocateStrings(UErrorCode &status);
1570  UBool hasStrings() const;
1571  int32_t stringsSize() const;
1572  UBool stringsContains(const UnicodeString &s) const;
1573 
1574  UnicodeString& _toPattern(UnicodeString& result,
1575  UBool escapeUnprintable) const;
1576 
1577  UnicodeString& _generatePattern(UnicodeString& result,
1578  UBool escapeUnprintable) const;
1579 
1580  static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1581 
1582  static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1583 
1584  //----------------------------------------------------------------
1585  // Implementation: Fundamental operators
1586  //----------------------------------------------------------------
1587 
1588  void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1589 
1590  void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1591 
1592  void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1593 
1599  static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1600  int32_t pos);
1601 
1602  static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1603  int32_t iterOpts);
1604 
1644  UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1645  ParsePosition& ppos,
1646  UErrorCode &ec);
1647 
1648  void applyPropertyPattern(RuleCharacterIterator& chars,
1649  UnicodeString& rebuiltPat,
1650  UErrorCode& ec);
1651 
1652  static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1653 
1658  typedef UBool (*Filter)(UChar32 codePoint, void* context);
1659 
1669  void applyFilter(Filter filter,
1670  void* context,
1671  const UnicodeSet* inclusions,
1672  UErrorCode &status);
1673 
1674  // UCPMap is now stable ICU 63
1675  void applyIntPropertyValue(const UCPMap *map,
1676  UCPMapValueFilter *filter, const void *context,
1677  UErrorCode &errorCode);
1678 
1682  void setPattern(const UnicodeString& newPat) {
1683  setPattern(newPat.getBuffer(), newPat.length());
1684  }
1685  void setPattern(const char16_t *newPat, int32_t newPatLen);
1689  void releasePattern();
1690 
1691  friend class UnicodeSetIterator;
1692 };
1693 
1694 
1695 
1696 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1697  return !operator==(o);
1698 }
1699 
1700 inline UBool UnicodeSet::isFrozen() const {
1701  return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1702 }
1703 
1704 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1705  return !containsNone(start, end);
1706 }
1707 
1708 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1709  return !containsNone(s);
1710 }
1711 
1712 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1713  return !containsNone(s);
1714 }
1715 
1716 inline UBool UnicodeSet::isBogus() const {
1717  return (UBool)(fFlags & kIsBogus);
1718 }
1719 
1720 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1721  return reinterpret_cast<UnicodeSet *>(uset);
1722 }
1723 
1724 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1725  return reinterpret_cast<const UnicodeSet *>(uset);
1726 }
1727 
1728 inline USet *UnicodeSet::toUSet() {
1729  return reinterpret_cast<USet *>(this);
1730 }
1731 
1732 inline const USet *UnicodeSet::toUSet() const {
1733  return reinterpret_cast<const USet *>(this);
1734 }
1735 
1736 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1737  int32_t sLength=s.length();
1738  if(start<0) {
1739  start=0;
1740  } else if(start>sLength) {
1741  start=sLength;
1742  }
1743  return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1744 }
1745 
1746 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1747  int32_t sLength=s.length();
1748  if(limit<0) {
1749  limit=0;
1750  } else if(limit>sLength) {
1751  limit=sLength;
1752  }
1753  return spanBack(s.getBuffer(), limit, spanCondition);
1754 }
1755 
1756 U_NAMESPACE_END
1757 
1758 #endif /* U_SHOW_CPLUSPLUS_API */
1759 
1760 #endif
#define INITIAL_CAPACITY
The initial size of an array if it is unspecified.
Definition: RunArrays.h:32
static UClassID getStaticClassID()
ICU "poor man's RTTI", returns a UClassID for this class.
struct UCPMap UCPMap
Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
Definition: ucpmap.h:31
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:33
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:96
This file defines an abstract map from Unicode code points to integer values.
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns true if this matcher will match a character c, where c & 0xFF == v, at offset, in the forward direction (with limit > offset).
C API: Unicode Set.
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:59
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=false) const =0
Returns a string representation of this matcher.
virtual UClassID getDynamicClassID(void) const =0
Returns a unique class ID polymorphically.
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:77
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:65
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const =0
Union the set of all characters that may be matched by this object into the given set...
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:335
uint32_t UCPMapValueFilter(const void *context, uint32_t value)
Callback function type: Modifies a map value.
Definition: ucpmap.h:114
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:467
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher API.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:279
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:159
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:195
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition: utypes.h:415
struct USet USet
USet is the C API type corresponding to C++ class UnicodeSet.
Definition: uset.h:50
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:52
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition: umachine.h:141
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
virtual UnicodeFilter * clone() const =0
Clones this object polymorphically.
Basic definitions for ICU, for both C and C++ APIs.
virtual UBool contains(UChar32 c) const =0
Returns true for characters that are in the selected subset.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:295
C++ API: Unicode Filter.
int32_t length(void) const
Return the length of the UnicodeString object.
Definition: unistr.h:3890
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition: umachine.h:269