From 00bb2f4be7d6cb5322253b109827e60a5f4d5d99 Mon Sep 17 00:00:00 2001 From: Rossen Georgiev Date: Tue, 10 Mar 2015 19:48:47 +0000 Subject: [PATCH] charset detection, unicode for all --- README.rst | 15 +++++++++++---- aprslib/base91.py | 4 ++-- aprslib/parse.py | 21 +++++++++++++++++++++ tests/test_base91.py | 3 ++- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index bb2d7a6..6a1428c 100644 --- a/README.rst +++ b/README.rst @@ -4,17 +4,24 @@ APRS library for Python |Build Status| |Coverage Status| A tiny library for dealing with APRS. It can be used to connect and listen to the APRS-IS feed as well as upload. -Parsing of packets is also possible, but the entire spec is not fully implemeneted yet. +Parsing of packets is also possible, but the entire spec is not fully implemented yet. The following is supported: - normal/compressed position reports - objects - mic-e position report - messages (inc. telemetry, bulletins, etc) -- base91 comment telemetry extention -- altitude extention +- base91 comment telemetry extension +- altitude extension - beacons +Packets can often contain characters outside of 7-bit ASCII. +``aprslib.parse()`` will attempt to guess the charset and return ``unicode`` strings, trying the following steps in order: + +1. Attempt to decode the string as ``utf-8`` +2. Attempt to guess the charset using the ``chardet`` module (if installed), and decode if the confidence factor is sufficient +3. Finally, decode as ``latin-1`` + Install ----------- @@ -116,7 +123,7 @@ Here is a simple example: DEBUG:aprslib.parse:Parsed ok. ... -Uploading packets to APRS-IS is posible through the ``sendall()`` method in ``IS``. +Uploading packets to APRS-IS is possible through the ``sendall()`` method in ``IS``. The method assumes a single line/packet per call. The parameters may end with ``\r\n``, but it's not required. .. 
code:: python diff --git a/aprslib/base91.py b/aprslib/base91.py index b184170..15392c0 100644 --- a/aprslib/base91.py +++ b/aprslib/base91.py @@ -29,8 +29,8 @@ def to_decimal(text): Takes a base91 char string and returns decimal """ - if not isinstance(text, str): - raise TypeError("expected str") + if not isinstance(text, basestring): + raise TypeError("expected str or unicode, %s given" % type(text)) if findall(r"[\x00-\x20\x7c-\xff]", text): raise ValueError("invalid character in sequence") diff --git a/aprslib/parse.py b/aprslib/parse.py index 1020251..8ee2f1e 100644 --- a/aprslib/parse.py +++ b/aprslib/parse.py @@ -24,6 +24,16 @@ import math import logging from datetime import datetime +try: + import chardet +except ImportError: + # create fake chardet + + class chardet: + @staticmethod + def detect(x): + return {'confidence': 0.0, 'encoding': 'windows-1252'} + from .exceptions import (UnknownFormat, ParseError) from . import base91 @@ -68,6 +78,17 @@ def parse(packet): * status message """ + # attempt to detect encoding + try: + packet = packet.decode('utf-8') + except UnicodeDecodeError: + res = chardet.detect(packet) + + if res['confidence'] > 0.7: + packet = packet.decode(res['encoding']) + else: + packet = packet.decode('latin-1') + packet = packet.rstrip("\r\n") logger.debug("Parsing: %s", packet) diff --git a/tests/test_base91.py b/tests/test_base91.py index c013e1a..ba3cb6c 100644 --- a/tests/test_base91.py +++ b/tests/test_base91.py @@ -64,12 +64,13 @@ class b_ToDecimal(unittest.TestCase): # 91**3 = "!!! # etc testData += [[91**i, '"' + '!'*i] for i in xrange(20)] + testData += [[91**i, u'"' + u'!'*i] for i in xrange(20)] for expected, n in testData: self.assertEqual(expected, base91.to_decimal(n)) def test_invalid_input_type(self): - testData = [-1, 0, 5, unicode('a'), None, ['d']] + testData = [-1, 0, 5, None, ['d']] for n in testData: self.assertRaises(TypeError, base91.to_decimal, n)