From 00bb2f4be7d6cb5322253b109827e60a5f4d5d99 Mon Sep 17 00:00:00 2001
From: Rossen Georgiev <zoorty@gmail.com>
Date: Tue, 10 Mar 2015 19:48:47 +0000
Subject: [PATCH] charset detection, unicode for all

---
 README.rst           | 15 +++++++++++----
 aprslib/base91.py    |  4 ++--
 aprslib/parse.py     | 21 +++++++++++++++++++++
 tests/test_base91.py |  3 ++-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/README.rst b/README.rst
index bb2d7a6..6a1428c 100644
--- a/README.rst
+++ b/README.rst
@@ -4,17 +4,24 @@ APRS library for Python
 |Build Status| |Coverage Status|
 
 A tiny library for dealing with APRS. It can be used to connect and listen to the APRS-IS feed as well as upload.
-Parsing of packets is also possible, but the entire spec is not fully implemeneted yet.
+Parsing of packets is also possible, but the entire spec is not fully implemented yet.
 The following is supported:
 
 -  normal/compressed position reports
 -  objects
 -  mic-e position report
 -  messages (inc. telemetry, bulletins, etc)
--  base91 comment telemetry extention
--  altitude extention
+-  base91 comment telemetry extension
+-  altitude extension
 -  beacons
 
+Packets can often contain characters outside of 7-bit ASCII.
+``aprslib.parse()`` will attempt to guess the charset and return ``unicode`` strings, trying the following steps in order:
+
+1. Attempt to decode the string as ``utf-8``
+2. Attempt to guess the charset using the ``chardet`` module (if installed), and decode with it if the confidence is above 0.7
+3. Otherwise, decode as ``latin-1``
+
 Install
 -----------
 
@@ -116,7 +123,7 @@ Here is a simple example:
     DEBUG:aprslib.parse:Parsed ok.
     ...
 
-Uploading packets to APRS-IS is posible through the ``sendall()`` method in ``IS``.
+Uploading packets to APRS-IS is possible through the ``sendall()`` method in ``IS``.
 The method assumes a single line/packet per call. The parameters may end with ``\r\n``, but it's not required.
 
 .. code:: python
diff --git a/aprslib/base91.py b/aprslib/base91.py
index b184170..15392c0 100644
--- a/aprslib/base91.py
+++ b/aprslib/base91.py
@@ -29,8 +29,8 @@ def to_decimal(text):
     Takes a base91 char string and returns decimal
     """
 
-    if not isinstance(text, str):
-        raise TypeError("expected str")
+    if not isinstance(text, basestring):
+        raise TypeError("expected str or unicode, %s given" % type(text))
 
     if findall(r"[\x00-\x20\x7c-\xff]", text):
         raise ValueError("invalid character in sequence")
diff --git a/aprslib/parse.py b/aprslib/parse.py
index 1020251..8ee2f1e 100644
--- a/aprslib/parse.py
+++ b/aprslib/parse.py
@@ -24,6 +24,16 @@ import math
 import logging
 from datetime import datetime
 
+try:
+    import chardet
+except ImportError:
+    # chardet is optional: this stub reports 0.0 confidence so parse() falls back to latin-1
+
+    class chardet:
+        @staticmethod
+        def detect(x):
+            return {'confidence': 0.0, 'encoding': 'windows-1252'}
+
 from .exceptions import (UnknownFormat, ParseError)
 from . import base91
 
@@ -68,6 +78,17 @@ def parse(packet):
       * status message
     """
 
+    # decode the byte string to unicode: utf-8 first, then a confident chardet guess, else latin-1
+    try:
+        packet = packet.decode('utf-8')
+    except UnicodeDecodeError:
+        res = chardet.detect(packet)
+
+        if res['confidence'] > 0.7:
+            packet = packet.decode(res['encoding'])
+        else:
+            packet = packet.decode('latin-1')
+
     packet = packet.rstrip("\r\n")
     logger.debug("Parsing: %s", packet)
 
diff --git a/tests/test_base91.py b/tests/test_base91.py
index c013e1a..ba3cb6c 100644
--- a/tests/test_base91.py
+++ b/tests/test_base91.py
@@ -64,12 +64,13 @@ class b_ToDecimal(unittest.TestCase):
         # 91**3 = "!!!
         # etc
         testData += [[91**i, '"' + '!'*i] for i in xrange(20)]
+        testData += [[91**i, u'"' + u'!'*i] for i in xrange(20)]
 
         for expected, n in testData:
             self.assertEqual(expected, base91.to_decimal(n))
 
     def test_invalid_input_type(self):
-        testData = [-1, 0, 5, unicode('a'), None, ['d']]
+        testData = [-1, 0, 5, None, ['d']]
 
         for n in testData:
             self.assertRaises(TypeError, base91.to_decimal, n)