aboutsummaryrefslogtreecommitdiff
path: root/requests/packages/chardet
diff options
context:
space:
mode:
Diffstat (limited to 'requests/packages/chardet')
-rw-r--r--requests/packages/chardet/__init__.py2
-rwxr-xr-xrequests/packages/chardet/chardetect.py64
-rw-r--r--requests/packages/chardet/jpcntx.py8
-rw-r--r--requests/packages/chardet/latin1prober.py6
-rw-r--r--requests/packages/chardet/mbcssm.py9
-rw-r--r--requests/packages/chardet/sjisprober.py2
-rw-r--r--requests/packages/chardet/universaldetector.py4
7 files changed, 67 insertions, 28 deletions
diff --git a/requests/packages/chardet/__init__.py b/requests/packages/chardet/__init__.py
index e4f0799..82c2a48 100644
--- a/requests/packages/chardet/__init__.py
+++ b/requests/packages/chardet/__init__.py
@@ -15,7 +15,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
-__version__ = "2.2.1"
+__version__ = "2.3.0"
from sys import version_info
diff --git a/requests/packages/chardet/chardetect.py b/requests/packages/chardet/chardetect.py
index ecd0163..ffe892f 100755
--- a/requests/packages/chardet/chardetect.py
+++ b/requests/packages/chardet/chardetect.py
@@ -12,34 +12,68 @@ Example::
If no paths are provided, it takes its input from stdin.
"""
+
+from __future__ import absolute_import, print_function, unicode_literals
+
+import argparse
+import sys
from io import open
-from sys import argv, stdin
+from chardet import __version__
from chardet.universaldetector import UniversalDetector
-def description_of(file, name='stdin'):
- """Return a string describing the probable encoding of a file."""
+def description_of(lines, name='stdin'):
+ """
+ Return a string describing the probable encoding of a file or
+ list of strings.
+
+ :param lines: The lines to get the encoding of.
+ :type lines: Iterable of bytes
+ :param name: Name of file or collection of lines
+ :type name: str
+ """
u = UniversalDetector()
- for line in file:
+ for line in lines:
u.feed(line)
u.close()
result = u.result
if result['encoding']:
- return '%s: %s with confidence %s' % (name,
- result['encoding'],
- result['confidence'])
+ return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
+ result['confidence'])
else:
- return '%s: no result' % name
+ return '{0}: no result'.format(name)
-def main():
- if len(argv) <= 1:
- print(description_of(stdin))
- else:
- for path in argv[1:]:
- with open(path, 'rb') as f:
- print(description_of(f, path))
+def main(argv=None):
+ '''
+ Handles command line arguments and gets things started.
+
+ :param argv: List of arguments, as if specified on the command-line.
+ If None, ``sys.argv[1:]`` is used instead.
+ :type argv: list of str
+ '''
+ # Get command line arguments
+ parser = argparse.ArgumentParser(
+ description="Takes one or more file paths and reports their detected \
+ encodings",
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ conflict_handler='resolve')
+ parser.add_argument('input',
+ help='File whose encoding we would like to determine.',
+ type=argparse.FileType('rb'), nargs='*',
+ default=[sys.stdin])
+ parser.add_argument('--version', action='version',
+ version='%(prog)s {0}'.format(__version__))
+ args = parser.parse_args(argv)
+
+ for f in args.input:
+ if f.isatty():
+ print("You are running chardetect interactively. Press " +
+ "CTRL-D twice at the start of a blank line to signal the " +
+ "end of your input. If you want help, run chardetect " +
+ "--help\n", file=sys.stderr)
+ print(description_of(f, f.name))
if __name__ == '__main__':
diff --git a/requests/packages/chardet/jpcntx.py b/requests/packages/chardet/jpcntx.py
index f7f69ba..59aeb6a 100644
--- a/requests/packages/chardet/jpcntx.py
+++ b/requests/packages/chardet/jpcntx.py
@@ -177,6 +177,12 @@ class JapaneseContextAnalysis:
return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis):
+ def __init__(self):
+ self.charset_name = "SHIFT_JIS"
+
+ def get_charset_name(self):
+ return self.charset_name
+
def get_order(self, aBuf):
if not aBuf:
return -1, 1
@@ -184,6 +190,8 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
first_char = wrap_ord(aBuf[0])
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
charLen = 2
+ if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
+ self.charset_name = "CP932"
else:
charLen = 1
diff --git a/requests/packages/chardet/latin1prober.py b/requests/packages/chardet/latin1prober.py
index ad695f5..eef3573 100644
--- a/requests/packages/chardet/latin1prober.py
+++ b/requests/packages/chardet/latin1prober.py
@@ -129,11 +129,11 @@ class Latin1Prober(CharSetProber):
if total < 0.01:
confidence = 0.0
else:
- confidence = ((self._mFreqCounter[3] / total)
- - (self._mFreqCounter[1] * 20.0 / total))
+ confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
+ / total)
if confidence < 0.0:
confidence = 0.0
# lower the confidence of latin1 so that other more accurate
# detector can take priority.
- confidence = confidence * 0.5
+ confidence = confidence * 0.73
return confidence
diff --git a/requests/packages/chardet/mbcssm.py b/requests/packages/chardet/mbcssm.py
index 3f93cfb..efe678c 100644
--- a/requests/packages/chardet/mbcssm.py
+++ b/requests/packages/chardet/mbcssm.py
@@ -353,7 +353,7 @@ SJIS_cls = (
2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f
- 3,3,3,3,3,3,3,3, # 80 - 87
+ 3,3,3,3,3,2,2,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f
@@ -369,9 +369,8 @@ SJIS_cls = (
2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef
- 4,4,4,4,4,4,4,4, # f0 - f7
- 4,4,4,4,4,0,0,0 # f8 - ff
-)
+ 3,3,3,3,3,3,3,3, # f0 - f7
+ 3,3,3,3,3,0,0,0) # f8 - ff
SJIS_st = (
@@ -571,5 +570,3 @@ UTF8SMModel = {'classTable': UTF8_cls,
'stateTable': UTF8_st,
'charLenTable': UTF8CharLenTable,
'name': 'UTF-8'}
-
-# flake8: noqa
diff --git a/requests/packages/chardet/sjisprober.py b/requests/packages/chardet/sjisprober.py
index b173614..cd0e9e7 100644
--- a/requests/packages/chardet/sjisprober.py
+++ b/requests/packages/chardet/sjisprober.py
@@ -47,7 +47,7 @@ class SJISProber(MultiByteCharSetProber):
self._mContextAnalyzer.reset()
def get_charset_name(self):
- return "SHIFT_JIS"
+ return self._mContextAnalyzer.get_charset_name()
def feed(self, aBuf):
aLen = len(aBuf)
diff --git a/requests/packages/chardet/universaldetector.py b/requests/packages/chardet/universaldetector.py
index 9a03ad3..476522b 100644
--- a/requests/packages/chardet/universaldetector.py
+++ b/requests/packages/chardet/universaldetector.py
@@ -71,9 +71,9 @@ class UniversalDetector:
if not self._mGotData:
# If the data starts with BOM, we know it is UTF
- if aBuf[:3] == codecs.BOM:
+ if aBuf[:3] == codecs.BOM_UTF8:
# EF BB BF UTF-8 with BOM
- self.result = {'encoding': "UTF-8", 'confidence': 1.0}
+ self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
elif aBuf[:4] == codecs.BOM_UTF32_LE:
# FF FE 00 00 UTF-32, little-endian BOM
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}