[筆記] 用 Javascript/PHP 從字串中提取 hashtag

最近需要弄一個提取 hashtag 的功能,Google 了一下答案都不盡理想,

突然想到 Facebook 本身在發文的時候就有 hashtag 功能,就翻了一下原始碼,

果真在 JS 中找到了做法,不愧是大公司的程式,寫的毫不馬虎,各種語言文字符號通通考慮到,

正所謂前人種樹,後人乘涼,人家都測試好的程式當然就要直接拿來用,可以省下不少 Debug 的時間(?)

Stackoverflow 上查到的規則多數都太過簡單,沒有較深的驗證規則,不喜翻。

像 Facebook 上的這種規則就設計的很好,所以直接用他們的程式就不用另外刻那麼多規則了(?

 

查找一下規則是寫在 bCEuPXQ-N0m.js 中的 getHashtagRegexString

 

原始程式是:

/**
 * Builds the source text of the hashtag-matching regular expression.
 *
 * The character sets are embedded as literal characters (the `\uXXXX`
 * sequences are string escapes, not regex escapes), so the returned text is
 * meant to be handed to `new RegExp(...)`.
 *
 * @returns {string} Pattern source with three capture groups:
 *   1. the boundary in front of the hashtag: start of input, end of input, or
 *      one character that is neither `&`, `/` nor a tag character;
 *   2. the hash sign, `#` or full-width U+FF03;
 *   3. the tag text: tag characters containing at least one character from
 *      the letter/mark set (digits and `_` alone do not form a tag).
 */
function h() {
    const toChar = String.fromCharCode;

    // Accented Latin ranges plus a handful of individual IPA/modifier letters.
    const latinExtras = [
        '\xc0-\xd6', '\xd8-\xf6', '\xf8-\xff',
        '\u0100-\u024f', '\u0253-\u0254', '\u0256-\u0257',
        '\u0259', '\u025b', '\u0263', '\u0268', '\u026f', '\u0272',
        '\u0289', '\u028b', '\u02bb',
        '\u0300-\u036f', '\u1e00-\u1eff',
    ].join('');

    // Ranges in the Cyrillic, Hebrew, Arabic, Thai and Hangul blocks, plus U+200C.
    const otherScripts = [
        '\u0400-\u04ff', '\u0500-\u0527', '\u2de0-\u2dff', '\ua640-\ua69f',
        '\u0591-\u05bf', '\u05c1-\u05c2', '\u05c4-\u05c5', '\u05c7',
        '\u05d0-\u05ea', '\u05f0-\u05f4',
        '\ufb12-\ufb28', '\ufb2a-\ufb36', '\ufb38-\ufb3c', '\ufb3e',
        '\ufb40-\ufb41', '\ufb43-\ufb44', '\ufb46-\ufb4f',
        '\u0610-\u061a', '\u0620-\u065f', '\u066e-\u06d3', '\u06d5-\u06dc',
        '\u06de-\u06e8', '\u06ea-\u06ef', '\u06fa-\u06fc', '\u06ff',
        '\u0750-\u077f', '\u08a0', '\u08a2-\u08ac', '\u08e4-\u08fe',
        '\ufb50-\ufbb1', '\ufbd3-\ufd3d', '\ufd50-\ufd8f', '\ufd92-\ufdc7',
        '\ufdf0-\ufdfb', '\ufe70-\ufe74', '\ufe76-\ufefc',
        '\u200c-\u200c',
        '\u0e01-\u0e3a', '\u0e40-\u0e4e',
        '\u1100-\u11ff', '\u3130-\u3185', '\uA960-\uA97F',
        '\uAC00-\uD7AF', '\uD7B0-\uD7FF', '\uFFA1-\uFFDC',
    ].join('');

    // Kana, full-width alphanumerics and CJK ideographs.
    // NOTE: String.fromCharCode keeps only the low 16 bits of its argument, so
    // the three computed ranges below really are U+A700-U+B73F, U+B740-U+B81F
    // and U+F800-U+FA1F, not the supplementary-plane code points the numbers
    // suggest. They are reproduced unchanged to keep matching identical.
    const eastAsian = [
        '\u30A1-\u30FA\u30FC-\u30FE',
        '\uFF66-\uFF9F',
        '\uFF10-\uFF19\uFF21-\uFF3A',
        '\uFF41-\uFF5A',
        '\u3041-\u3096\u3099-\u309E',
        '\u3400-\u4DBF',
        '\u4E00-\u9FFF',
        toChar(0x2A700), '-', toChar(0x2B73F),
        toChar(0x2B740), '-', toChar(0x2B81F),
        toChar(0x2F800), '-', toChar(0x2FA1F),
        '\u3003\u3005\u303B',
    ].join('');

    // Long list of letter ranges across many scripts.
    const letterRanges = [
        '\u0041-\u005A\u0061-\u007A\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6',
        '\u00F8-\u0241\u0250-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EE\u037A\u0386',
        '\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03F5\u03F7-\u0481',
        '\u048A-\u04CE\u04D0-\u04F9\u0500-\u050F\u0531-\u0556\u0559\u0561-\u0587',
        '\u05D0-\u05EA\u05F0-\u05F2\u0621-\u063A\u0640-\u064A\u066E-\u066F',
        '\u0671-\u06D3\u06D5\u06E5-\u06E6\u06EE-\u06EF\u06FA-\u06FC\u06FF\u0710',
        '\u0712-\u072F\u074D-\u076D\u0780-\u07A5\u07B1\u0904-\u0939\u093D\u0950',
        '\u0958-\u0961\u097D\u0985-\u098C\u098F-\u0990\u0993-\u09A8\u09AA-\u09B0',
        '\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC-\u09DD\u09DF-\u09E1\u09F0-\u09F1',
        '\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32-\u0A33',
        '\u0A35-\u0A36\u0A38-\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D',
        '\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3\u0AB5-\u0AB9\u0ABD',
        '\u0AD0\u0AE0-\u0AE1\u0B05-\u0B0C\u0B0F-\u0B10\u0B13-\u0B28\u0B2A-\u0B30',
        '\u0B32-\u0B33\u0B35-\u0B39\u0B3D\u0B5C-\u0B5D\u0B5F-\u0B61\u0B71\u0B83',
        '\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F',
        '\u0BA3-\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0C05-\u0C0C\u0C0E-\u0C10',
        '\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C60-\u0C61\u0C85-\u0C8C',
        '\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE',
        '\u0CE0-\u0CE1\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39',
        '\u0D60-\u0D61\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6',
        '\u0E01-\u0E30\u0E32-\u0E33\u0E40-\u0E46\u0E81-\u0E82\u0E84\u0E87-\u0E88',
        '\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7',
        '\u0EAA-\u0EAB\u0EAD-\u0EB0\u0EB2-\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6',
        '\u0EDC-\u0EDD\u0F00\u0F40-\u0F47\u0F49-\u0F6A\u0F88-\u0F8B\u1000-\u1021',
        '\u1023-\u1027\u1029-\u102A\u1050-\u1055\u10A0-\u10C5\u10D0-\u10FA\u10FC',
        '\u1100-\u1159\u115F-\u11A2\u11A8-\u11F9\u1200-\u1248\u124A-\u124D',
        '\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0',
        '\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310',
        '\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F4\u1401-\u166C',
        '\u166F-\u1676\u1681-\u169A\u16A0-\u16EA\u1700-\u170C\u170E-\u1711',
        '\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7',
        '\u17DC\u1820-\u1877\u1880-\u18A8\u1900-\u191C\u1950-\u196D\u1970-\u1974',
        '\u1980-\u19A9\u19C1-\u19C7\u1A00-\u1A16\u1D00-\u1DBF\u1E00-\u1E9B',
        '\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D',
        '\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC',
        '\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC',
        '\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u2094\u2102\u2107',
        '\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D',
        '\u212F-\u2131\u2133-\u2139\u213C-\u213F\u2145-\u2149\u2C00-\u2C2E',
        '\u2C30-\u2C5E\u2C80-\u2CE4\u2D00-\u2D25\u2D30-\u2D65\u2D6F\u2D80-\u2D96',
        '\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6',
        '\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u3005-\u3006\u3031-\u3035',
        '\u303B-\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF',
        '\u3105-\u312C\u3131-\u318E\u31A0-\u31B7\u31F0-\u31FF\u3400-\u4DB5',
        '\u4E00-\u9FBB\uA000-\uA48C\uA800-\uA801\uA803-\uA805\uA807-\uA80A',
        '\uA80C-\uA822\uAC00-\uD7A3\uF900-\uFA2D\uFA30-\uFA6A\uFA70-\uFAD9',
        '\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C',
        '\uFB3E\uFB40-\uFB41\uFB43-\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F',
        '\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A',
        '\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7',
        '\uFFDA-\uFFDC',
    ].join('');

    // Combining-mark ranges (accents, vowel signs, variation selectors, ...).
    const markRanges = [
        '\u0300-\u036F\u0483-\u0486\u0591-\u05B9\u05BB-\u05BD\u05BF',
        '\u05C1-\u05C2\u05C4-\u05C5\u05C7\u0610-\u0615\u064B-\u065E\u0670',
        '\u06D6-\u06DC\u06DF-\u06E4\u06E7-\u06E8\u06EA-\u06ED\u0711\u0730-\u074A',
        '\u07A6-\u07B0\u0901-\u0903\u093C\u093E-\u094D\u0951-\u0954\u0962-\u0963',
        '\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7-\u09C8\u09CB-\u09CD\u09D7',
        '\u09E2-\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47-\u0A48\u0A4B-\u0A4D',
        '\u0A70-\u0A71\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD',
        '\u0AE2-\u0AE3\u0B01-\u0B03\u0B3C\u0B3E-\u0B43\u0B47-\u0B48\u0B4B-\u0B4D',
        '\u0B56-\u0B57\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7',
        '\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56',
        '\u0C82-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5-\u0CD6',
        '\u0D02-\u0D03\u0D3E-\u0D43\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D82-\u0D83',
        '\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2-\u0DF3\u0E31\u0E34-\u0E3A',
        '\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB-\u0EBC\u0EC8-\u0ECD\u0F18-\u0F19',
        '\u0F35\u0F37\u0F39\u0F3E-\u0F3F\u0F71-\u0F84\u0F86-\u0F87\u0F90-\u0F97',
        '\u0F99-\u0FBC\u0FC6\u102C-\u1032\u1036-\u1039\u1056-\u1059\u135F',
        '\u1712-\u1714\u1732-\u1734\u1752-\u1753\u1772-\u1773\u17B6-\u17D3\u17DD',
        '\u180B-\u180D\u18A9\u1920-\u192B\u1930-\u193B\u19B0-\u19C0\u19C8-\u19C9',
        '\u1A17-\u1A1B\u1DC0-\u1DC3\u20D0-\u20DC\u20E1\u20E5-\u20EB\u302A-\u302F',
        '\u3099-\u309A\uA802\uA806\uA80B\uA823-\uA827\uFB1E\uFE00-\uFE0F',
        '\uFE20-\uFE23',
    ].join('');

    // Decimal-digit ranges for ASCII and other scripts.
    const digitRanges = [
        '\u0030-\u0039\u0660-\u0669\u06F0-\u06F9\u0966-\u096F\u09E6-\u09EF',
        '\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE6-\u0BEF\u0C66-\u0C6F',
        '\u0CE6-\u0CEF\u0D66-\u0D6F\u0E50-\u0E59\u0ED0-\u0ED9\u0F20-\u0F29',
        '\u1040-\u1049\u17E0-\u17E9\u1810-\u1819\u1946-\u194F\u19D0-\u19D9',
        '\uFF10-\uFF19',
    ].join('');

    // Characters that make a tag "real" (at least one is required) ...
    const coreChars = letterRanges + markRanges + latinExtras + otherScripts + eastAsian;
    // ... and everything that may appear anywhere inside a tag.
    const tagChars = coreChars + digitRanges + '_';

    const coreClass = '[' + coreChars + ']';
    const tagClass = '[' + tagChars + ']';
    // A hashtag may only start at the input edges or after a character that
    // is not `&`, `/` or itself a tag character.
    const boundary = '^|$|[^&/' + tagChars + ']';
    const hashSign = '[#\\uFF03]';
    const tagBody = tagClass + '*' + coreClass + tagClass + '*';

    return '(' + boundary + ')(' + hashSign + ')(' + tagBody + ')';
}

 

整理一下放到 Demo 上,大概就是這個 Feel,可以自行輸入測試:

See the Pen Hashtag extracter by Lay (@Brownsugar) on CodePen.

 

因為主要是後端需要,所以把它翻譯成 PHP:

/**
 * Builds the body of the PCRE hashtag pattern (without delimiters).
 *
 * The pattern uses \x{XXXX} code point escapes, so it must be compiled with
 * the `u` modifier. The forward slash inside the negated class is already
 * escaped, so `/` can be used as the delimiter:
 * preg_match_all('/'.getPattern().'/u', $text, $matches).
 *
 * Capture groups:
 *   1. the boundary before the hashtag: start of text, end of text, or one
 *      character that is neither `&`, `/` nor a tag character;
 *   2. the hash sign, `#` or full-width U+FF03;
 *   3. the tag text, which must contain at least one character from the
 *      letter/mark set (digits and `_` alone are not a tag).
 *
 * Every literal below is kept on a single line and split only between
 * complete escapes: a line break inside `\x{...}` would end up inside the
 * character class and corrupt the pattern.
 *
 * @return string Pattern body to be wrapped in delimiters and the `u` modifier.
 */
function getPattern() {
	// Accented Latin ranges plus a few individual IPA/modifier letters.
	$latinExtras = '\xc0-\xd6'.'\xd8-\xf6'.'\xf8-\xff'
		.'\x{0100}-\x{024f}\x{0253}-\x{0254}\x{0256}-\x{0257}'
		.'\x{0259}\x{025b}\x{0263}\x{0268}\x{026f}\x{0272}\x{0289}\x{028b}\x{02bb}'
		.'\x{0300}-\x{036f}\x{1e00}-\x{1eff}';

	// Ranges in the Cyrillic, Hebrew, Arabic, Thai and Hangul blocks, plus U+200C.
	$otherScripts = '\x{0400}-\x{04ff}\x{0500}-\x{0527}\x{2de0}-\x{2dff}\x{a640}-\x{a69f}'
		.'\x{0591}-\x{05bf}\x{05c1}-\x{05c2}\x{05c4}-\x{05c5}\x{05c7}'
		.'\x{05d0}-\x{05ea}\x{05f0}-\x{05f4}'
		.'\x{fb12}-\x{fb28}\x{fb2a}-\x{fb36}\x{fb38}-\x{fb3c}\x{fb3e}'
		.'\x{fb40}-\x{fb41}\x{fb43}-\x{fb44}\x{fb46}-\x{fb4f}'
		.'\x{0610}-\x{061a}\x{0620}-\x{065f}\x{066e}-\x{06d3}\x{06d5}-\x{06dc}'
		.'\x{06de}-\x{06e8}\x{06ea}-\x{06ef}\x{06fa}-\x{06fc}\x{06ff}'
		.'\x{0750}-\x{077f}\x{08a0}\x{08a2}-\x{08ac}\x{08e4}-\x{08fe}'
		.'\x{fb50}-\x{fbb1}\x{fbd3}-\x{fd3d}\x{fd50}-\x{fd8f}\x{fd92}-\x{fdc7}'
		.'\x{fdf0}-\x{fdfb}\x{fe70}-\x{fe74}\x{fe76}-\x{fefc}'
		.'\x{200c}-\x{200c}'
		.'\x{0e01}-\x{0e3a}\x{0e40}-\x{0e4e}'
		.'\x{1100}-\x{11ff}\x{3130}-\x{3185}\x{A960}-\x{A97F}'
		.'\x{AC00}-\x{D7AF}\x{D7B0}-\x{D7FF}\x{FFA1}-\x{FFDC}';

	// Kana, full-width alphanumerics and CJK ideographs. The three ranges
	// above U+FFFF are inserted as literal UTF-8 characters via unichr().
	$eastAsian = '\x{30A1}-\x{30FA}\x{30FC}-\x{30FE}'
		.'\x{FF66}-\x{FF9F}'
		.'\x{FF10}-\x{FF19}\x{FF21}-\x{FF3A}'
		.'\x{FF41}-\x{FF5A}'
		.'\x{3041}-\x{3096}\x{3099}-\x{309E}'
		.'\x{3400}-\x{4DBF}'
		.'\x{4E00}-\x{9FFF}'
		.unichr(173824).'-'.unichr(177983)
		.unichr(177984).'-'.unichr(178207)
		.unichr(194560).'-'.unichr(195103)
		.'\x{3003}\x{3005}\x{303B}';

	$extraChars = $latinExtras.$otherScripts.$eastAsian;

	// Long list of letter ranges across many scripts.
	$letterRanges = '\x{0041}-\x{005A}\x{0061}-\x{007A}\x{00AA}\x{00B5}\x{00BA}\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}'
		.'\x{00F8}-\x{0241}\x{0250}-\x{02C1}\x{02C6}-\x{02D1}\x{02E0}-\x{02E4}\x{02EE}\x{037A}\x{0386}'
		.'\x{0388}-\x{038A}\x{038C}\x{038E}-\x{03A1}\x{03A3}-\x{03CE}\x{03D0}-\x{03F5}\x{03F7}-\x{0481}'
		.'\x{048A}-\x{04CE}\x{04D0}-\x{04F9}\x{0500}-\x{050F}\x{0531}-\x{0556}\x{0559}\x{0561}-\x{0587}'
		.'\x{05D0}-\x{05EA}\x{05F0}-\x{05F2}\x{0621}-\x{063A}\x{0640}-\x{064A}\x{066E}-\x{066F}'
		.'\x{0671}-\x{06D3}\x{06D5}\x{06E5}-\x{06E6}\x{06EE}-\x{06EF}\x{06FA}-\x{06FC}\x{06FF}\x{0710}'
		.'\x{0712}-\x{072F}\x{074D}-\x{076D}\x{0780}-\x{07A5}\x{07B1}\x{0904}-\x{0939}\x{093D}\x{0950}'
		.'\x{0958}-\x{0961}\x{097D}\x{0985}-\x{098C}\x{098F}-\x{0990}\x{0993}-\x{09A8}\x{09AA}-\x{09B0}'
		.'\x{09B2}\x{09B6}-\x{09B9}\x{09BD}\x{09CE}\x{09DC}-\x{09DD}\x{09DF}-\x{09E1}\x{09F0}-\x{09F1}'
		.'\x{0A05}-\x{0A0A}\x{0A0F}-\x{0A10}\x{0A13}-\x{0A28}\x{0A2A}-\x{0A30}\x{0A32}-\x{0A33}'
		.'\x{0A35}-\x{0A36}\x{0A38}-\x{0A39}\x{0A59}-\x{0A5C}\x{0A5E}\x{0A72}-\x{0A74}\x{0A85}-\x{0A8D}'
		.'\x{0A8F}-\x{0A91}\x{0A93}-\x{0AA8}\x{0AAA}-\x{0AB0}\x{0AB2}-\x{0AB3}\x{0AB5}-\x{0AB9}\x{0ABD}'
		.'\x{0AD0}\x{0AE0}-\x{0AE1}\x{0B05}-\x{0B0C}\x{0B0F}-\x{0B10}\x{0B13}-\x{0B28}\x{0B2A}-\x{0B30}'
		.'\x{0B32}-\x{0B33}\x{0B35}-\x{0B39}\x{0B3D}\x{0B5C}-\x{0B5D}\x{0B5F}-\x{0B61}\x{0B71}\x{0B83}'
		.'\x{0B85}-\x{0B8A}\x{0B8E}-\x{0B90}\x{0B92}-\x{0B95}\x{0B99}-\x{0B9A}\x{0B9C}\x{0B9E}-\x{0B9F}'
		.'\x{0BA3}-\x{0BA4}\x{0BA8}-\x{0BAA}\x{0BAE}-\x{0BB9}\x{0C05}-\x{0C0C}\x{0C0E}-\x{0C10}'
		.'\x{0C12}-\x{0C28}\x{0C2A}-\x{0C33}\x{0C35}-\x{0C39}\x{0C60}-\x{0C61}\x{0C85}-\x{0C8C}'
		.'\x{0C8E}-\x{0C90}\x{0C92}-\x{0CA8}\x{0CAA}-\x{0CB3}\x{0CB5}-\x{0CB9}\x{0CBD}\x{0CDE}'
		.'\x{0CE0}-\x{0CE1}\x{0D05}-\x{0D0C}\x{0D0E}-\x{0D10}\x{0D12}-\x{0D28}\x{0D2A}-\x{0D39}'
		.'\x{0D60}-\x{0D61}\x{0D85}-\x{0D96}\x{0D9A}-\x{0DB1}\x{0DB3}-\x{0DBB}\x{0DBD}\x{0DC0}-\x{0DC6}'
		.'\x{0E01}-\x{0E30}\x{0E32}-\x{0E33}\x{0E40}-\x{0E46}\x{0E81}-\x{0E82}\x{0E84}\x{0E87}-\x{0E88}'
		.'\x{0E8A}\x{0E8D}\x{0E94}-\x{0E97}\x{0E99}-\x{0E9F}\x{0EA1}-\x{0EA3}\x{0EA5}\x{0EA7}'
		.'\x{0EAA}-\x{0EAB}\x{0EAD}-\x{0EB0}\x{0EB2}-\x{0EB3}\x{0EBD}\x{0EC0}-\x{0EC4}\x{0EC6}'
		.'\x{0EDC}-\x{0EDD}\x{0F00}\x{0F40}-\x{0F47}\x{0F49}-\x{0F6A}\x{0F88}-\x{0F8B}\x{1000}-\x{1021}'
		.'\x{1023}-\x{1027}\x{1029}-\x{102A}\x{1050}-\x{1055}\x{10A0}-\x{10C5}\x{10D0}-\x{10FA}\x{10FC}'
		.'\x{1100}-\x{1159}\x{115F}-\x{11A2}\x{11A8}-\x{11F9}\x{1200}-\x{1248}\x{124A}-\x{124D}'
		.'\x{1250}-\x{1256}\x{1258}\x{125A}-\x{125D}\x{1260}-\x{1288}\x{128A}-\x{128D}\x{1290}-\x{12B0}'
		.'\x{12B2}-\x{12B5}\x{12B8}-\x{12BE}\x{12C0}\x{12C2}-\x{12C5}\x{12C8}-\x{12D6}\x{12D8}-\x{1310}'
		.'\x{1312}-\x{1315}\x{1318}-\x{135A}\x{1380}-\x{138F}\x{13A0}-\x{13F4}\x{1401}-\x{166C}'
		.'\x{166F}-\x{1676}\x{1681}-\x{169A}\x{16A0}-\x{16EA}\x{1700}-\x{170C}\x{170E}-\x{1711}'
		.'\x{1720}-\x{1731}\x{1740}-\x{1751}\x{1760}-\x{176C}\x{176E}-\x{1770}\x{1780}-\x{17B3}\x{17D7}'
		.'\x{17DC}\x{1820}-\x{1877}\x{1880}-\x{18A8}\x{1900}-\x{191C}\x{1950}-\x{196D}\x{1970}-\x{1974}'
		.'\x{1980}-\x{19A9}\x{19C1}-\x{19C7}\x{1A00}-\x{1A16}\x{1D00}-\x{1DBF}\x{1E00}-\x{1E9B}'
		.'\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}'
		.'\x{1F50}-\x{1F57}\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}'
		.'\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}'
		.'\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}\x{2071}\x{207F}\x{2090}-\x{2094}\x{2102}\x{2107}'
		.'\x{210A}-\x{2113}\x{2115}\x{2119}-\x{211D}\x{2124}\x{2126}\x{2128}\x{212A}-\x{212D}'
		.'\x{212F}-\x{2131}\x{2133}-\x{2139}\x{213C}-\x{213F}\x{2145}-\x{2149}\x{2C00}-\x{2C2E}'
		.'\x{2C30}-\x{2C5E}\x{2C80}-\x{2CE4}\x{2D00}-\x{2D25}\x{2D30}-\x{2D65}\x{2D6F}\x{2D80}-\x{2D96}'
		.'\x{2DA0}-\x{2DA6}\x{2DA8}-\x{2DAE}\x{2DB0}-\x{2DB6}\x{2DB8}-\x{2DBE}\x{2DC0}-\x{2DC6}'
		.'\x{2DC8}-\x{2DCE}\x{2DD0}-\x{2DD6}\x{2DD8}-\x{2DDE}\x{3005}-\x{3006}\x{3031}-\x{3035}'
		.'\x{303B}-\x{303C}\x{3041}-\x{3096}\x{309D}-\x{309F}\x{30A1}-\x{30FA}\x{30FC}-\x{30FF}'
		.'\x{3105}-\x{312C}\x{3131}-\x{318E}\x{31A0}-\x{31B7}\x{31F0}-\x{31FF}\x{3400}-\x{4DB5}'
		.'\x{4E00}-\x{9FBB}\x{A000}-\x{A48C}\x{A800}-\x{A801}\x{A803}-\x{A805}\x{A807}-\x{A80A}'
		.'\x{A80C}-\x{A822}\x{AC00}-\x{D7A3}\x{F900}-\x{FA2D}\x{FA30}-\x{FA6A}\x{FA70}-\x{FAD9}'
		.'\x{FB00}-\x{FB06}\x{FB13}-\x{FB17}\x{FB1D}\x{FB1F}-\x{FB28}\x{FB2A}-\x{FB36}\x{FB38}-\x{FB3C}'
		.'\x{FB3E}\x{FB40}-\x{FB41}\x{FB43}-\x{FB44}\x{FB46}-\x{FBB1}\x{FBD3}-\x{FD3D}\x{FD50}-\x{FD8F}'
		.'\x{FD92}-\x{FDC7}\x{FDF0}-\x{FDFB}\x{FE70}-\x{FE74}\x{FE76}-\x{FEFC}\x{FF21}-\x{FF3A}'
		.'\x{FF41}-\x{FF5A}\x{FF66}-\x{FFBE}\x{FFC2}-\x{FFC7}\x{FFCA}-\x{FFCF}\x{FFD2}-\x{FFD7}'
		.'\x{FFDA}-\x{FFDC}';

	// Combining-mark ranges (accents, vowel signs, variation selectors, ...).
	$markRanges = '\x{0300}-\x{036F}\x{0483}-\x{0486}\x{0591}-\x{05B9}\x{05BB}-\x{05BD}\x{05BF}'
		.'\x{05C1}-\x{05C2}\x{05C4}-\x{05C5}\x{05C7}\x{0610}-\x{0615}\x{064B}-\x{065E}\x{0670}'
		.'\x{06D6}-\x{06DC}\x{06DF}-\x{06E4}\x{06E7}-\x{06E8}\x{06EA}-\x{06ED}\x{0711}\x{0730}-\x{074A}'
		.'\x{07A6}-\x{07B0}\x{0901}-\x{0903}\x{093C}\x{093E}-\x{094D}\x{0951}-\x{0954}\x{0962}-\x{0963}'
		.'\x{0981}-\x{0983}\x{09BC}\x{09BE}-\x{09C4}\x{09C7}-\x{09C8}\x{09CB}-\x{09CD}\x{09D7}'
		.'\x{09E2}-\x{09E3}\x{0A01}-\x{0A03}\x{0A3C}\x{0A3E}-\x{0A42}\x{0A47}-\x{0A48}\x{0A4B}-\x{0A4D}'
		.'\x{0A70}-\x{0A71}\x{0A81}-\x{0A83}\x{0ABC}\x{0ABE}-\x{0AC5}\x{0AC7}-\x{0AC9}\x{0ACB}-\x{0ACD}'
		.'\x{0AE2}-\x{0AE3}\x{0B01}-\x{0B03}\x{0B3C}\x{0B3E}-\x{0B43}\x{0B47}-\x{0B48}\x{0B4B}-\x{0B4D}'
		.'\x{0B56}-\x{0B57}\x{0B82}\x{0BBE}-\x{0BC2}\x{0BC6}-\x{0BC8}\x{0BCA}-\x{0BCD}\x{0BD7}'
		.'\x{0C01}-\x{0C03}\x{0C3E}-\x{0C44}\x{0C46}-\x{0C48}\x{0C4A}-\x{0C4D}\x{0C55}-\x{0C56}'
		.'\x{0C82}-\x{0C83}\x{0CBC}\x{0CBE}-\x{0CC4}\x{0CC6}-\x{0CC8}\x{0CCA}-\x{0CCD}\x{0CD5}-\x{0CD6}'
		.'\x{0D02}-\x{0D03}\x{0D3E}-\x{0D43}\x{0D46}-\x{0D48}\x{0D4A}-\x{0D4D}\x{0D57}\x{0D82}-\x{0D83}'
		.'\x{0DCA}\x{0DCF}-\x{0DD4}\x{0DD6}\x{0DD8}-\x{0DDF}\x{0DF2}-\x{0DF3}\x{0E31}\x{0E34}-\x{0E3A}'
		.'\x{0E47}-\x{0E4E}\x{0EB1}\x{0EB4}-\x{0EB9}\x{0EBB}-\x{0EBC}\x{0EC8}-\x{0ECD}\x{0F18}-\x{0F19}'
		.'\x{0F35}\x{0F37}\x{0F39}\x{0F3E}-\x{0F3F}\x{0F71}-\x{0F84}\x{0F86}-\x{0F87}\x{0F90}-\x{0F97}'
		.'\x{0F99}-\x{0FBC}\x{0FC6}\x{102C}-\x{1032}\x{1036}-\x{1039}\x{1056}-\x{1059}\x{135F}'
		.'\x{1712}-\x{1714}\x{1732}-\x{1734}\x{1752}-\x{1753}\x{1772}-\x{1773}\x{17B6}-\x{17D3}\x{17DD}'
		.'\x{180B}-\x{180D}\x{18A9}\x{1920}-\x{192B}\x{1930}-\x{193B}\x{19B0}-\x{19C0}\x{19C8}-\x{19C9}'
		.'\x{1A17}-\x{1A1B}\x{1DC0}-\x{1DC3}\x{20D0}-\x{20DC}\x{20E1}\x{20E5}-\x{20EB}\x{302A}-\x{302F}'
		.'\x{3099}-\x{309A}\x{A802}\x{A806}\x{A80B}\x{A823}-\x{A827}\x{FB1E}\x{FE00}-\x{FE0F}'
		.'\x{FE20}-\x{FE23}';

	// Decimal-digit ranges for ASCII and other scripts.
	$digitRanges = '\x{0030}-\x{0039}\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}'
		.'\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}'
		.'\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}'
		.'\x{1040}-\x{1049}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}'
		.'\x{FF10}-\x{FF19}';

	// Characters of which a tag needs at least one ...
	$coreChars = $letterRanges.$markRanges.$extraChars;
	// ... and everything allowed anywhere inside a tag.
	$tagChars = $coreChars.$digitRanges.'_';

	$coreClass = '['.$coreChars.']';
	$tagClass = '['.$tagChars.']';
	// `\/` keeps the pattern usable with `/` as the delimiter.
	$boundary = '^|$|[^&\/'.$tagChars.']';
	$hashSign = '[#\x{FF03}]';

	return '('.$boundary.')('.$hashSign.')('.$tagClass.'*'.$coreClass.$tagClass.'*)';
}
/**
 * Returns the UTF-8 encoded character for a Unicode code point.
 *
 * Unlike chr(), which only produces a single byte, this also handles code
 * points above U+00FF (including those beyond U+FFFF).
 *
 * @param int|string $u Unicode code point; converted with intval().
 * @return string|false UTF-8 character. When mb_chr() is used, an invalid
 *                      code point yields false.
 */
function unichr($u) {
	$codePoint = intval($u);
	// Prefer mb_chr() (PHP 7.2+): the 'HTML-ENTITIES' pseudo-encoding used by
	// the fallback below is deprecated as of PHP 8.2.
	if (function_exists('mb_chr')) {
		return mb_chr($codePoint, 'UTF-8');
	}
	// Older PHP: decode a numeric character reference into UTF-8.
	return mb_convert_encoding('&#'.$codePoint.';', 'UTF-8', 'HTML-ENTITIES');
}

 

大概的重點是:

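  1. Javascript 的 String.fromCharCode() 在 PHP 上沒有直接對應的內建函式:chr() 只能產生單一位元組(0–255),所以這裡改用上面自訂的 unichr(),以 mb_convert_encoding 把 Unicode 碼位轉成 UTF-8 字元
  2. \uXXXX 要改成 \x{XXXX},而且正規表示式必須加上 /u 修飾子,PCRE 才會把 \x{XXXX} 當成 Unicode 碼位處理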

 

最後再使用 preg_match_all 抓取內容就行:

// Collect every hashtag found in $content. The `u` modifier makes PCRE treat
// both pattern and subject as UTF-8, which the \x{XXXX} escapes produced by
// getPattern() rely on.
// With the default ordering, $matches[0] holds the full matches, $matches[1]
// the boundary before each hashtag (empty at the start of the text),
// $matches[2] the hash sign and $matches[3] the tag text.
preg_match_all('/'.getPattern().'/u', $content, $matches);

想隨時追蹤最新資訊?歡迎使用 RSS 訂閱最新文章 »

您或許會感興趣的文章

隨機推薦

共有 0 則迴響

暫時沒有迴響,歡迎您率先發表!

發表迴響

*