charset detection, unicode for all
This commit is contained in:
parent
bd8b1867aa
commit
00bb2f4be7
15
README.rst
15
README.rst
|
|
@ -4,17 +4,24 @@ APRS library for Python
|
|||
|Build Status| |Coverage Status|
|
||||
|
||||
A tiny library for dealing with APRS. It can be used to connect and listen to the APRS-IS feed as well as upload.
|
||||
Parsing of packets is also possible, but the entire spec is not fully implemeneted yet.
|
||||
Parsing of packets is also possible, but the entire spec is not fully implemented yet.
|
||||
The following is supported:
|
||||
|
||||
- normal/compressed position reports
|
||||
- objects
|
||||
- mic-e position report
|
||||
- messages (inc. telemetry, bulletins, etc)
|
||||
- base91 comment telemetry extention
|
||||
- altitude extention
|
||||
- base91 comment telemetry extension
|
||||
- altitude extension
|
||||
- beacons
|
||||
|
||||
Packets can often contain characters outside of 7-bit ASCII.
|
||||
``aprslib.parse()`` will attempt to guess the charset and return ``unicode`` strings, using the following steps in order:
|
||||
|
||||
1. Attempt to decode string as ``utf-8``
|
||||
2. Attempt to guess the charset using the ``chardet`` module (if installed), and decode with the guessed charset if the confidence factor is sufficient
|
||||
3. Finally, decode as ``latin-1``
|
||||
|
||||
Install
|
||||
-----------
|
||||
|
||||
|
|
@ -116,7 +123,7 @@ Here is a simple example:
|
|||
DEBUG:aprslib.parse:Parsed ok.
|
||||
...
|
||||
|
||||
Uploading packets to APRS-IS is posible through the ``sendall()`` method in ``IS``.
|
||||
Uploading packets to APRS-IS is possible through the ``sendall()`` method in ``IS``.
|
||||
The method assumes a single line/packet per call. The parameters may end with ``\r\n``, but it's not required.
|
||||
|
||||
.. code:: python
|
||||
|
|
|
|||
|
|
@ -29,8 +29,8 @@ def to_decimal(text):
|
|||
Takes a base91 char string and returns decimal
|
||||
"""
|
||||
|
||||
if not isinstance(text, str):
|
||||
raise TypeError("expected str")
|
||||
if not isinstance(text, basestring):
|
||||
raise TypeError("expected str or unicode, %s given" % type(text))
|
||||
|
||||
if findall(r"[\x00-\x20\x7c-\xff]", text):
|
||||
raise ValueError("invalid character in sequence")
|
||||
|
|
|
|||
|
|
@ -24,6 +24,16 @@ import math
|
|||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
try:
|
||||
import chardet
|
||||
except ImportError:
|
||||
# create fake chardet
|
||||
|
||||
class chardet:
|
||||
@staticmethod
|
||||
def detect(x):
|
||||
return {'confidence': 0.0, 'encoding': 'windows-1252'}
|
||||
|
||||
from .exceptions import (UnknownFormat, ParseError)
|
||||
from . import base91
|
||||
|
||||
|
|
@ -68,6 +78,17 @@ def parse(packet):
|
|||
* status message
|
||||
"""
|
||||
|
||||
# attempt to detect encoding
|
||||
try:
|
||||
packet = packet.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
res = chardet.detect(packet)
|
||||
|
||||
if res['confidence'] > 0.7:
|
||||
packet = packet.decode(res['encoding'])
|
||||
else:
|
||||
packet = packet.decode('latin-1')
|
||||
|
||||
packet = packet.rstrip("\r\n")
|
||||
logger.debug("Parsing: %s", packet)
|
||||
|
||||
|
|
|
|||
|
|
@ -64,12 +64,13 @@ class b_ToDecimal(unittest.TestCase):
|
|||
# 91**3 = "!!!
|
||||
# etc
|
||||
testData += [[91**i, '"' + '!'*i] for i in xrange(20)]
|
||||
testData += [[91**i, u'"' + u'!'*i] for i in xrange(20)]
|
||||
|
||||
for expected, n in testData:
|
||||
self.assertEqual(expected, base91.to_decimal(n))
|
||||
|
||||
def test_invalid_input_type(self):
|
||||
testData = [-1, 0, 5, unicode('a'), None, ['d']]
|
||||
testData = [-1, 0, 5, None, ['d']]
|
||||
|
||||
for n in testData:
|
||||
self.assertRaises(TypeError, base91.to_decimal, n)
|
||||
|
|
|
|||
Loading…
Reference in New Issue