charset detection, unicode for all

2015-03-10 19:48:47 +00:00 · 2015-03-10 19:48:47 +00:00 · 00bb2f4be7
parent bd8b1867aa
commit 00bb2f4be7
4 changed files with 36 additions and 7 deletions
--- a/README.rst
+++ b/README.rst
@@ -4,17 +4,24 @@ APRS library for Python
 |Build Status| |Coverage Status|

 A tiny library for dealing with APRS. It can be used to connect and listen to the APRS-IS feed as well as upload.
-Parsing of packets is also possible, but the entire spec is not fully implemeneted yet.
+Parsing of packets is also possible, but the entire spec is not fully implemented yet.
 The following is supported:

 -  normal/compressed position reports
 -  objects
 -  mic-e position report
 -  messages (inc. telemetry, bulletins, etc)
-  base91 comment telemetry extention
-  altitude extention
+-  base91 comment telemetry extension
+-  altitude extension
 -  beacons

+Packets can often contain characters outside of 7-bit ASCII.
+``aprslib.parse()`` will attempt to guess the charset and return ``unicode`` strings by trying the following steps in order:
+
+1. Attempt to decode string as ``utf-8``
+2. Attempt to guess the charset using the ``chardet`` module (if installed) and decode with it if the reported confidence is above 0.7
+3. Finally, decode as ``latin-1``
+
 Install
 -----------

@@ -116,7 +123,7 @@ Here is a simple example:
    DEBUG:aprslib.parse:Parsed ok.
    ...

-Uploading packets to APRS-IS is posible through the ``sendall()`` method in ``IS``.
+Uploading packets to APRS-IS is possible through the ``sendall()`` method in ``IS``.
 The method assumes a single line/packet per call. The parameters may end with ``\r\n``, but it's not required.

 .. code:: python
--- a/aprslib/base91.py
+++ b/aprslib/base91.py
@@ -29,8 +29,8 @@ def to_decimal(text):
    Takes a base91 char string and returns decimal
    """

-    if not isinstance(text, str):
-        raise TypeError("expected str")
+    if not isinstance(text, basestring):
+        raise TypeError("expected str or unicode, %s given" % type(text))

    if findall(r"[\x00-\x20\x7c-\xff]", text):
        raise ValueError("invalid character in sequence")
--- a/aprslib/parse.py
+++ b/aprslib/parse.py
@@ -24,6 +24,16 @@ import math
 import logging
 from datetime import datetime

+try:
+    import chardet
+except ImportError:
+    # chardet is optional: this stub always reports zero confidence, so parse() falls back to latin-1
+
+    class chardet:
+        @staticmethod
+        def detect(x):
+            return {'confidence': 0.0, 'encoding': 'windows-1252'}
+
 from .exceptions import (UnknownFormat, ParseError)
 from . import base91

@@ -68,6 +78,17 @@ def parse(packet):
      * status message
    """

+    # attempt to detect encoding
+    try:
+        packet = packet.decode('utf-8')
+    except UnicodeDecodeError:
+        res = chardet.detect(packet)
+
+        if res['confidence'] > 0.7:
+            packet = packet.decode(res['encoding'])
+        else:
+            packet = packet.decode('latin-1')
+
    packet = packet.rstrip("\r\n")
    logger.debug("Parsing: %s", packet)

--- a/tests/test_base91.py
+++ b/tests/test_base91.py
@@ -64,12 +64,13 @@ class b_ToDecimal(unittest.TestCase):
        # 91**3 = "!!!
        # etc
        testData += [[91**i, '"' + '!'*i] for i in xrange(20)]
+        testData += [[91**i, u'"' + u'!'*i] for i in xrange(20)]

        for expected, n in testData:
            self.assertEqual(expected, base91.to_decimal(n))

    def test_invalid_input_type(self):
-        testData = [-1, 0, 5, unicode('a'), None, ['d']]
+        testData = [-1, 0, 5, None, ['d']]

        for n in testData:
            self.assertRaises(TypeError, base91.to_decimal, n)