first commit

2020-01-02 22:20:31 +07:00
commit 10eb3340ad
5753 changed files with 631345 additions and 0 deletions
@@ -0,0 +1,69 @@
+<?php
+/**
+* PCRE Regular expressions for UTF-8. Note this file is not actually used by
+* the rest of the library but these regular expressions can be useful to have
+* available.
+* @version $Id$
+* @see http://www.w3.org/International/questions/qa-forms-utf-8
+* @package utf8
+* @subpackage patterns
+*/
+
+//--------------------------------------------------------------------
+/**
+* PCRE Pattern to check a UTF-8 string is valid
+* Comes from W3 FAQ: Multilingual Forms
+* Note: modified to include full ASCII range including control chars
+* @see http://www.w3.org/International/questions/qa-forms-utf-8
+* @package utf8
+* @subpackage patterns
+*/
+$UTF8_VALID = '^('.
+    '[\x00-\x7F]'.                          # ASCII (including control chars)
+    '|[\xC2-\xDF][\x80-\xBF]'.              # non-overlong 2-byte
+    '|\xE0[\xA0-\xBF][\x80-\xBF]'.          # excluding overlongs
+    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.   # straight 3-byte
+    '|\xED[\x80-\x9F][\x80-\xBF]'.          # excluding surrogates
+    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.       # planes 1-3
+    '|[\xF1-\xF3][\x80-\xBF]{3}'.           # planes 4-15
+    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.       # plane 16
+    ')*$';
+
+//--------------------------------------------------------------------
+/**
+* PCRE Pattern to match single UTF-8 characters
+* Comes from W3 FAQ: Multilingual Forms
+* Note: modified to include full ASCII range including control chars
+* @see http://www.w3.org/International/questions/qa-forms-utf-8
+* @package utf8
+* @subpackage patterns
+*/
+$UTF8_MATCH =
+    '([\x00-\x7F])'.                          # ASCII (including control chars)
+    '|([\xC2-\xDF][\x80-\xBF])'.              # non-overlong 2-byte
+    '|(\xE0[\xA0-\xBF][\x80-\xBF])'.          # excluding overlongs
+    '|([\xE1-\xEC\xEE\xEF][\x80-\xBF]{2})'.   # straight 3-byte
+    '|(\xED[\x80-\x9F][\x80-\xBF])'.          # excluding surrogates
+    '|(\xF0[\x90-\xBF][\x80-\xBF]{2})'.       # planes 1-3
+    '|([\xF1-\xF3][\x80-\xBF]{3})'.           # planes 4-15
+    '|(\xF4[\x80-\x8F][\x80-\xBF]{2})';       # plane 16
+
+//--------------------------------------------------------------------
+/**
+* PCRE Pattern to locate bad bytes in a UTF-8 string
+* Comes from W3 FAQ: Multilingual Forms
+* Note: modified to include full ASCII range including control chars
+* @see http://www.w3.org/International/questions/qa-forms-utf-8
+* @package utf8
+* @subpackage patterns
+*/
+$UTF8_BAD =
+    '([\x00-\x7F]'.                          # ASCII (including control chars)
+    '|[\xC2-\xDF][\x80-\xBF]'.               # non-overlong 2-byte
+    '|\xE0[\xA0-\xBF][\x80-\xBF]'.           # excluding overlongs
+    '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.    # straight 3-byte
+    '|\xED[\x80-\x9F][\x80-\xBF]'.           # excluding surrogates
+    '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.        # planes 1-3
+    '|[\xF1-\xF3][\x80-\xBF]{3}'.            # planes 4-15
+    '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.        # plane 16
+    '|(.{1}))';                              # invalid byte