# Source code for franz.openrdf.util.strings
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable-msg=C0103
################################################################################
# Copyright (c) 2006-2017 Franz Inc.
# All rights reserved. This program and the accompanying materials are
# made available under the terms of the MIT License which accompanies
# this distribution, and is available at http://opensource.org/licenses/MIT
################################################################################
from __future__ import absolute_import
from __future__ import unicode_literals
from future.builtins import chr
from future.types import newbytes
from future.utils import native_str, isnewbytes
from past.builtins import unicode
import ast
import sys
"""
A strings utility module for helper functions.
"""
import re
###############################################################################
## Canonical NTriples encoding
###############################################################################
def encode_ntriple_string(string):
    """
    Escape a string following the canonical N-Triples encoding rules.

    Input that is not already a unicode string is first decoded as UTF-8.
    Every (character, replacement) pair listed in ``ESCAPES`` is then
    applied, one after another, in list order.

    :param string: Text to escape.
    :return: The escaped unicode string.
    """
    escaped = string if isinstance(string, unicode) else unicode(string, 'utf-8')
    for raw, substitute in ESCAPES:
        escaped = escaped.replace(raw, substitute)
    return escaped
# Each pair maps a raw character to its escaped form.  The pairs are
# applied one after another, so the backslash pair has to come first;
# otherwise the backslashes introduced by the later pairs would be
# escaped a second time.
ESCAPES = [
    ('\\', r'\\'),  # backslash (0x5C)
    ('\n', r'\n'),  # line feed (0x0A)
    ('\r', r'\r'),  # carriage return (0x0D)
    ('"', r'\"'),   # double quote (0x22)
]

# Characters replaced by an escape sequence inside a URI reference:
# everything from 0x00 through 0x20 (space), plus < > " { } | ^ ` and \
uri_escaped_chars = re.compile(r'[\x00-\x20<>"{}|^`\\]')
def uri_escape_match(match):
    """
    Build the N-Triples escape sequence for a matched character.

    :param match: Match object whose matched text is a single character.
    :return: A backslash, ``u`` and four lowercase hex digits for code
             points up to 0xFFFF; a backslash, ``U`` and eight lowercase
             hex digits for anything above that.
    """
    codepoint = ord(match.group())
    template = '\\u%04x' if codepoint <= 0xffff else '\\U%08x'
    return template % codepoint
def encode_ntriple_uri(uri):
    """
    Render a URI string as an N-Triples URI reference.

    Every character matched by ``uri_escaped_chars`` is replaced with the
    escape sequence produced by ``uri_escape_match``; the result is then
    wrapped in angle brackets.

    :param uri: The URI text to encode.
    :return: The escaped URI enclosed in ``<`` and ``>``.
    """
    escaped = uri_escaped_chars.sub(uri_escape_match, uri)
    return '<' + escaped + '>'
# Matches either an existing backslash escape (including a backslash
# followed by a line break) or a raw double quote / line break.  The raw
# characters would otherwise terminate the Python string literal that
# ntriples_unescape() builds around the text.
_RAW_DELIMITERS = re.compile(r'\\(?:\r\n|.)|["\n\r]', re.DOTALL)
_RAW_DELIMITER_ESCAPES = {'"': '\\"', '\n': '\\n', '\r': '\\r'}


def _escape_raw_delimiter(match):
    """Escape a raw quote or line break; return existing escapes unchanged."""
    token = match.group()
    return _RAW_DELIMITER_ESCAPES.get(token, token)


def ntriples_unescape(text):
    """
    Decodes ntriples escape sequences in a string.

    Decoding is delegated to ``ast.literal_eval`` on a Python unicode
    literal built around ``text``, so a superset of the N-Triples escape
    sequences is understood.

    Raw (unescaped) double quotes and line breaks in ``text`` are escaped
    before evaluation.  Without that step such characters end the literal
    early: input like ``a", "b`` would evaluate to a tuple instead of a
    string, and a raw line break would raise ``SyntaxError``.

    :param text: The escaped text, or ``None``.
    :return: The decoded unicode string, or ``None`` if ``text`` is ``None``.
    :raises: Whatever ``ast.literal_eval`` raises for text that still does
             not form a valid literal (e.g. a trailing lone backslash).
    """
    if text is None:
        return None
    safe_text = _RAW_DELIMITERS.sub(_escape_raw_delimiter, text)
    return ast.literal_eval(u'u"' + safe_text + u'"')
def uriref(string):
    """
    Extract the URI from an N-Triples URI reference.

    :param string: Candidate text, e.g. ``<http://example.org/x>``.
    :return: The unescaped URI (as a string) when ``string`` matches
             ``uriref.pattern``, otherwise ``None``.
    """
    found = uriref.pattern.match(string)
    return ntriples_unescape(found.group(1)) if found else None


# Angle brackets around: one or more non-colon characters, a colon, and
# at least one further character.  The bracketed content is captured.
uri_pattern = r'<([^:]+:.+)>'
uriref.pattern = re.compile(uri_pattern + '$')
def nodeid(string):
    """
    Extract the node id from an N-Triples blank node reference.

    :param string: Candidate text, e.g. ``_:b1``.
    :return: The part after ``_:`` when ``string`` matches
             ``nodeid.pattern``, otherwise ``None``.
    """
    found = nodeid.pattern.match(string)
    return found.group(1) if found else None


# "_:" followed by an ASCII letter and then any number of ASCII letters
# or digits; the id (without the "_:" prefix) is captured.
nodeid.pattern = re.compile(r'_:([A-Za-z][A-Za-z0-9]*)$')
def literal(string):
    """
    Parse a literal written in N-Triples syntax.

    :param string: Candidate text: a double-quoted value, optionally
                   followed by ``@`` and a language tag or by ``^^`` and
                   a datatype URI reference in angle brackets.
    :return: ``None`` if ``string`` is not a valid literal.  Otherwise a
             tuple ``(value, datatype, language)`` -- note that the
             datatype comes before the language tag.  The value and the
             datatype URI are unescaped; ``datatype`` or ``language`` is
             ``None`` when the literal does not carry one.
    """
    match = literal.pattern.match(string)
    if not match:
        return None
    # The pattern captures the language tag before the datatype, but the
    # returned tuple puts the datatype first.
    label, lang, dtype = match.groups()
    return ntriples_unescape(label), ntriples_unescape(dtype), lang


# Quoted value: runs of characters other than '"' and '\', interleaved
# with backslash escapes.
litvalue = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
# Optional suffix: either a language tag or a datatype URI reference.
# Language tags may use upper- and lowercase letters (e.g. "en-US"),
# as the RDF 1.1 N-Triples LANGTAG production allows.
litinfo = r'(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)|\^\^' + uri_pattern + r')?'
literal.pattern = re.compile(litvalue + litinfo + '$')
def to_bytes(text):
    """
    Encode Unicode text as UTF-8 bytes; pass anything else through.

    :param text: Text to be converted.
    :type text: str|bytes|unicode
    :return: The UTF-8 encoding of ``text`` when it is a Unicode string,
             otherwise ``text`` itself, unchanged.
    :rtype: bytes
    """
    return text.encode('utf-8') if isinstance(text, unicode) else text
if sys.version_info[0] <= 2:
    def to_native_string(text):
        """
        Converts text to the native (byte) string type of Python 2.

        Byte strings of the ``future`` package are converted with
        ``bytes.__str__``, native strings are returned as they are, and
        anything else is treated as Unicode and encoded as UTF-8.

        :param text: Text to be converted (either Unicode or bytes).
        :type text: str|bytes|unicode
        :rtype: str
        """
        if isnewbytes(text):
            return bytes.__str__(text)
        # Whatever is not a native string must be Unicode...
        return text if isinstance(text, native_str) else text.encode('utf-8')
else:
    def to_native_string(text):
        """
        Converts text to the native string type of the Python version used.

        Bytes are decoded as UTF-8; any other value is returned unchanged.

        :param text: Text to be converted (either Unicode or bytes).
        :type text: str|bytes|unicode
        :rtype: str
        """
        return str(text, 'utf-8') if isinstance(text, bytes) else text