about summary refs log tree commit diff
path: root/requests/packages/charade
diff options
context:
space:
mode:
Diffstat (limited to 'requests/packages/charade')
-rw-r--r-- requests/packages/charade/__init__.py 7
-rw-r--r-- requests/packages/charade/chardistribution.py 3
-rw-r--r-- requests/packages/charade/compat.py 12
-rw-r--r-- requests/packages/charade/cp949prober.py 44
-rw-r--r-- requests/packages/charade/langcyrillicmodel.py 2
-rw-r--r-- requests/packages/charade/langgreekmodel.py 2
-rw-r--r-- requests/packages/charade/langhebrewmodel.py 2
-rw-r--r-- requests/packages/charade/langhungarianmodel.py 2
-rw-r--r-- requests/packages/charade/mbcsgroupprober.py 2
-rw-r--r-- requests/packages/charade/mbcssm.py 40
-rw-r--r-- requests/packages/charade/universaldetector.py 15
11 files changed, 112 insertions, 19 deletions
diff --git a/requests/packages/charade/__init__.py b/requests/packages/charade/__init__.py
index 5d580b3..1aadf3e 100644
--- a/requests/packages/charade/__init__.py
+++ b/requests/packages/charade/__init__.py
@@ -15,10 +15,15 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-__version__ = "1.0.1"
+__version__ = "1.0.3"
+from sys import version_info
def detect(aBuf):
+ if ((version_info < (3, 0) and isinstance(aBuf, unicode)) or
+ (version_info >= (3, 0) and not isinstance(aBuf, bytes))):
+ raise ValueError('Expected a bytes object, not a unicode object')
+
from . import universaldetector
u = universaldetector.UniversalDetector()
u.reset()
diff --git a/requests/packages/charade/chardistribution.py b/requests/packages/charade/chardistribution.py
index 981bd1a..dfd3355 100644
--- a/requests/packages/charade/chardistribution.py
+++ b/requests/packages/charade/chardistribution.py
@@ -40,6 +40,7 @@ from .compat import wrap_ord
ENOUGH_DATA_THRESHOLD = 1024
SURE_YES = 0.99
SURE_NO = 0.01
+MINIMUM_DATA_THRESHOLD = 3
class CharDistributionAnalysis:
@@ -82,7 +83,7 @@ class CharDistributionAnalysis:
"""return confidence based on existing data"""
# if we didn't receive any character in our consideration range,
# return negative answer
- if self._mTotalChars <= 0:
+ if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
return SURE_NO
if self._mTotalChars != self._mFreqChars:
diff --git a/requests/packages/charade/compat.py b/requests/packages/charade/compat.py
index f86c46b..d9e30ad 100644
--- a/requests/packages/charade/compat.py
+++ b/requests/packages/charade/compat.py
@@ -18,9 +18,17 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
+import sys
+
+
+if sys.version_info < (3, 0):
+ base_str = (str, unicode)
+else:
+ base_str = (bytes, str)
+
def wrap_ord(a):
- if isinstance(a, str):
+ if sys.version_info < (3, 0) and isinstance(a, base_str):
return ord(a)
- elif isinstance(a, int):
+ else:
return a
diff --git a/requests/packages/charade/cp949prober.py b/requests/packages/charade/cp949prober.py
new file mode 100644
index 0000000..543501f
--- /dev/null
+++ b/requests/packages/charade/cp949prober.py
@@ -0,0 +1,44 @@
+######################## BEGIN LICENSE BLOCK ########################
+# The Original Code is mozilla.org code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 1998
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Mark Pilgrim - port to Python
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301 USA
+######################### END LICENSE BLOCK #########################
+
+from .mbcharsetprober import MultiByteCharSetProber
+from .codingstatemachine import CodingStateMachine
+from .chardistribution import EUCKRDistributionAnalysis
+from .mbcssm import CP949SMModel
+
+
+class CP949Prober(MultiByteCharSetProber):
+ def __init__(self):
+ MultiByteCharSetProber.__init__(self)
+ self._mCodingSM = CodingStateMachine(CP949SMModel)
+ # NOTE: CP949 is a superset of EUC-KR, so the distribution should not
+ # be different.
+ self._mDistributionAnalyzer = EUCKRDistributionAnalysis()
+ self.reset()
+
+ def get_charset_name(self):
+ return "CP949"
diff --git a/requests/packages/charade/langcyrillicmodel.py b/requests/packages/charade/langcyrillicmodel.py
index 4b69c82..15e338f 100644
--- a/requests/packages/charade/langcyrillicmodel.py
+++ b/requests/packages/charade/langcyrillicmodel.py
@@ -25,8 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from . import constants
-
# KOI8-R language model
# Character Mapping Table:
KOI8R_CharToOrderMap = (
diff --git a/requests/packages/charade/langgreekmodel.py b/requests/packages/charade/langgreekmodel.py
index 78e9ce6..93241ce 100644
--- a/requests/packages/charade/langgreekmodel.py
+++ b/requests/packages/charade/langgreekmodel.py
@@ -25,8 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from . import constants
-
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
diff --git a/requests/packages/charade/langhebrewmodel.py b/requests/packages/charade/langhebrewmodel.py
index 4c6b3ce..d871324 100644
--- a/requests/packages/charade/langhebrewmodel.py
+++ b/requests/packages/charade/langhebrewmodel.py
@@ -27,8 +27,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from . import constants
-
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
diff --git a/requests/packages/charade/langhungarianmodel.py b/requests/packages/charade/langhungarianmodel.py
index bd7f505..6f59c61 100644
--- a/requests/packages/charade/langhungarianmodel.py
+++ b/requests/packages/charade/langhungarianmodel.py
@@ -25,8 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-from . import constants
-
# 255: Control characters that usually does not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to word
diff --git a/requests/packages/charade/mbcsgroupprober.py b/requests/packages/charade/mbcsgroupprober.py
index ebe93d0..2f6f5e8 100644
--- a/requests/packages/charade/mbcsgroupprober.py
+++ b/requests/packages/charade/mbcsgroupprober.py
@@ -33,6 +33,7 @@ from .sjisprober import SJISProber
from .eucjpprober import EUCJPProber
from .gb2312prober import GB2312Prober
from .euckrprober import EUCKRProber
+from .cp949prober import CP949Prober
from .big5prober import Big5Prober
from .euctwprober import EUCTWProber
@@ -46,6 +47,7 @@ class MBCSGroupProber(CharSetGroupProber):
EUCJPProber(),
GB2312Prober(),
EUCKRProber(),
+ CP949Prober(),
Big5Prober(),
EUCTWProber()
]
diff --git a/requests/packages/charade/mbcssm.py b/requests/packages/charade/mbcssm.py
index 3a720c9..55c02f0 100644
--- a/requests/packages/charade/mbcssm.py
+++ b/requests/packages/charade/mbcssm.py
@@ -78,6 +78,46 @@ Big5SMModel = {'classTable': BIG5_cls,
'charLenTable': Big5CharLenTable,
'name': 'Big5'}
+# CP949
+
+CP949_cls = (
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,0,0, # 00 - 0f
+ 1,1,1,1,1,1,1,1, 1,1,1,0,1,1,1,1, # 10 - 1f
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 20 - 2f
+ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, # 30 - 3f
+ 1,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, # 40 - 4f
+ 4,4,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 50 - 5f
+ 1,5,5,5,5,5,5,5, 5,5,5,5,5,5,5,5, # 60 - 6f
+ 5,5,5,5,5,5,5,5, 5,5,5,1,1,1,1,1, # 70 - 7f
+ 0,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 80 - 8f
+ 6,6,6,6,6,6,6,6, 6,6,6,6,6,6,6,6, # 90 - 9f
+ 6,7,7,7,7,7,7,7, 7,7,7,7,7,8,8,8, # a0 - af
+ 7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7, # b0 - bf
+ 7,7,7,7,7,7,9,2, 2,3,2,2,2,2,2,2, # c0 - cf
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # d0 - df
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, # e0 - ef
+ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,0, # f0 - ff
+)
+
+CP949_st = (
+#cls= 0 1 2 3 4 5 6 7 8 9 # previous state =
+ eError,eStart, 3,eError,eStart,eStart, 4, 5,eError, 6, # eStart
+ eError,eError,eError,eError,eError,eError,eError,eError,eError,eError, # eError
+ eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe, # eItsMe
+ eError,eError,eStart,eStart,eError,eError,eError,eStart,eStart,eStart, # 3
+ eError,eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 4
+ eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart,eStart, # 5
+ eError,eStart,eStart,eStart,eStart,eError,eError,eStart,eStart,eStart, # 6
+)
+
+CP949CharLenTable = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2)
+
+CP949SMModel = {'classTable': CP949_cls,
+ 'classFactor': 10,
+ 'stateTable': CP949_st,
+ 'charLenTable': CP949CharLenTable,
+ 'name': 'CP949'}
+
# EUC-JP
EUCJP_cls = (
diff --git a/requests/packages/charade/universaldetector.py b/requests/packages/charade/universaldetector.py
index adaae72..6175bfb 100644
--- a/requests/packages/charade/universaldetector.py
+++ b/requests/packages/charade/universaldetector.py
@@ -28,6 +28,7 @@
from . import constants
import sys
+import codecs
from .latin1prober import Latin1Prober # windows-1252
from .mbcsgroupprober import MBCSGroupProber # multi-byte character sets
from .sbcsgroupprober import SBCSGroupProber # single-byte character sets
@@ -70,31 +71,31 @@ class UniversalDetector:
if not self._mGotData:
# If the data starts with BOM, we know it is UTF
- if aBuf[:3] == '\xEF\xBB\xBF':
+ if aBuf[:3] == codecs.BOM_UTF8:
# EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8", 'confidence': 1.0}
- elif aBuf[:4] == '\xFF\xFE\x00\x00':
+ elif aBuf[:4] == codecs.BOM_UTF32_LE:
# FF FE 00 00 UTF-32, little-endian BOM
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
- elif aBuf[:4] == '\x00\x00\xFE\xFF':
+ elif aBuf[:4] == codecs.BOM_UTF32_BE:
# 00 00 FE FF UTF-32, big-endian BOM
self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
- elif aBuf[:4] == '\xFE\xFF\x00\x00':
+ elif aBuf[:4] == b'\xFE\xFF\x00\x00':
# FE FF 00 00 UCS-4, unusual octet order BOM (3412)
self.result = {
'encoding': "X-ISO-10646-UCS-4-3412",
'confidence': 1.0
}
- elif aBuf[:4] == '\x00\x00\xFF\xFE':
+ elif aBuf[:4] == b'\x00\x00\xFF\xFE':
# 00 00 FF FE UCS-4, unusual octet order BOM (2143)
self.result = {
'encoding': "X-ISO-10646-UCS-4-2143",
'confidence': 1.0
}
- elif aBuf[:2] == '\xFF\xFE':
+ elif aBuf[:2] == codecs.BOM_LE:
# FF FE UTF-16, little endian BOM
self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
- elif aBuf[:2] == '\xFE\xFF':
+ elif aBuf[:2] == codecs.BOM_BE:
# FE FF UTF-16, big endian BOM
self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}