charset detection, unicode for all

This commit is contained in:
Rossen Georgiev 2015-03-10 19:48:47 +00:00
parent bd8b1867aa
commit 00bb2f4be7
4 changed files with 36 additions and 7 deletions

View File

@ -4,17 +4,24 @@ APRS library for Python
|Build Status| |Coverage Status|
A tiny library for dealing with APRS. It can be used to connect and listen to the APRS-IS feed as well as upload.
Parsing of packets is also possible, but the entire spec is not fully implemeneted yet.
Parsing of packets is also possible, but the entire spec is not fully implemented yet.
The following is supported:
- normal/compressed position reports
- objects
- mic-e position report
- messages (inc. telemetry, bulletins, etc)
- base91 comment telemetry extention
- altitude extention
- base91 comment telemetry extension
- altitude extension
- beacons
Packets can often contain characters outside of 7-bit ASCII.
``aprslib.parse()`` will attempt to detect the charset and return ``unicode`` strings, trying the following steps in order:
1. Attempt to decode string as ``utf-8``
2. Attempt to guess the charset using the ``chardet`` module (if installed), and decode with the guessed charset if the confidence factor is sufficient (above 0.7)
3. Finally, decode as ``latin-1``
Install
-----------
@ -116,7 +123,7 @@ Here is a simple example:
DEBUG:aprslib.parse:Parsed ok.
...
Uploading packets to APRS-IS is posible through the ``sendall()`` method in ``IS``.
Uploading packets to APRS-IS is possible through the ``sendall()`` method in ``IS``.
The method assumes a single line/packet per call. The parameters may end with ``\r\n``, but it's not required.
.. code:: python

View File

@ -29,8 +29,8 @@ def to_decimal(text):
Takes a base91 char string and returns decimal
"""
if not isinstance(text, str):
raise TypeError("expected str")
if not isinstance(text, basestring):
raise TypeError("expected str or unicode, %s given" % type(text))
if findall(r"[\x00-\x20\x7c-\xff]", text):
raise ValueError("invalid character in sequence")

View File

@ -24,6 +24,16 @@ import math
import logging
from datetime import datetime
# chardet is an optional dependency; parse() calls chardet.detect() to guess
# a packet's charset when it is not valid utf-8
try:
import chardet
except ImportError:
# chardet is not installed: define a stand-in exposing the same detect()
# call. It always reports a confidence of 0.0, so the confidence check in
# parse() fails and the packet is decoded as latin-1 instead.
class chardet:
@staticmethod
def detect(x):
return {'confidence': 0.0, 'encoding': 'windows-1252'}
from .exceptions import (UnknownFormat, ParseError)
from . import base91
@ -68,6 +78,17 @@ def parse(packet):
* status message
"""
# attempt to detect encoding
try:
packet = packet.decode('utf-8')
except UnicodeDecodeError:
res = chardet.detect(packet)
if res['confidence'] > 0.7:
packet = packet.decode(res['encoding'])
else:
packet = packet.decode('latin-1')
packet = packet.rstrip("\r\n")
logger.debug("Parsing: %s", packet)

View File

@ -64,12 +64,13 @@ class b_ToDecimal(unittest.TestCase):
# 91**3 = "!!!
# etc
testData += [[91**i, '"' + '!'*i] for i in xrange(20)]
testData += [[91**i, u'"' + u'!'*i] for i in xrange(20)]
for expected, n in testData:
self.assertEqual(expected, base91.to_decimal(n))
def test_invalid_input_type(self):
testData = [-1, 0, 5, unicode('a'), None, ['d']]
testData = [-1, 0, 5, None, ['d']]
for n in testData:
self.assertRaises(TypeError, base91.to_decimal, n)