Source code for franz.openrdf.util.strings

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable-msg=C0103

################################################################################
# Copyright (c) 2006-2017 Franz Inc.  
# All rights reserved. This program and the accompanying materials are
# made available under the terms of the MIT License which accompanies
# this distribution, and is available at http://opensource.org/licenses/MIT
################################################################################

from __future__ import absolute_import
from __future__ import unicode_literals
from future.builtins import chr
from future.types import newbytes
from future.utils import native_str, isnewbytes
from past.builtins import unicode
import ast
import sys

"""
A strings utility module for helper functions.
"""

import re

###############################################################################
## Canonical NTriples encoding
###############################################################################


def encode_ntriple_string(string):
    """
    Escape a string following the canonical N-Triples encoding rules.

    Input that is not already a unicode string is first decoded as UTF-8.
    Every (character, replacement) pair listed in ``ESCAPES`` is then
    applied, one after another, in list order.

    :param string: Text to be escaped.
    :return: The escaped text as a unicode string.
    """
    text = string if isinstance(string, unicode) else unicode(string, 'utf-8')
    for raw, escaped in ESCAPES:
        text = text.replace(raw, escaped)
    return text
# (character, replacement) pairs used when escaping N-Triples strings.
# The pairs are applied one after another, so the backslash entry has to
# stay first: otherwise the backslashes introduced by the later
# replacements would themselves be escaped again.
ESCAPES = [
    ('\\', r'\\'),
    ('\n', r'\n'),
    ('\r', r'\r'),
    ('"', r'\"'),
]

# Matches every single character that has to be replaced by a numeric
# escape when a URI is written between angle brackets.
uri_escaped_chars = re.compile(r'[\x00-\x20<>"{}|^`\\]')
def uri_escape_match(match):
    """
    Turn a single-character regex match into an N-Triples numeric escape.

    Code points up to 0xFFFF are rendered as a backslash, ``u`` and four
    hex digits; larger ones as a backslash, ``U`` and eight hex digits.

    :param match: Match object whose matched text is exactly one character.
    :return: The escape sequence for that character.
    """
    codepoint = ord(match.group())
    template = '\\u%04x' if codepoint <= 0xffff else '\\U%08x'
    return template % codepoint
def encode_ntriple_uri(uri):
    """
    Render a URI string in N-Triples syntax.

    Each character matched by ``uri_escaped_chars`` is replaced by the
    escape produced by ``uri_escape_match``; the result is then wrapped
    in angle brackets.

    :param uri: The URI as a string.
    :return: The escaped URI enclosed in ``<`` and ``>``.
    """
    escaped = uri_escaped_chars.sub(uri_escape_match, uri)
    return '<' + escaped + '>'
# Matches either an existing backslash escape (which must be passed through
# untouched) or a raw character that cannot appear unescaped inside the
# one-line, double-quoted Python literal built by ntriples_unescape.
_LITERAL_UNSAFE = re.compile(r'\\.|["\n\r]', re.DOTALL)
_LITERAL_PROTECT = {'"': '\\"', '\n': '\\n', '\r': '\\r'}


def ntriples_unescape(text):
    """
    Decodes ntriples escape sequences in a string.

    Actually decodes a superset of said sequences: the text is evaluated
    as the body of a Python unicode string literal, so every escape that
    Python understands is decoded.

    Raw double quotes, line feeds and carriage returns found in the input
    are kept as-is in the result. Without protecting them first they would
    either end the generated literal early (silently dropping characters)
    or make it syntactically invalid.

    :param text: Text containing escape sequences, or `None`.
    :return: The decoded text, or `None` if `text` is `None`.
    :raises SyntaxError: If `text` still cannot be parsed, e.g. because it
                         ends with a lone backslash.
    """
    if text is None:
        return None
    # Existing two-character escapes are not keys of _LITERAL_PROTECT and
    # are therefore returned unchanged; only raw unsafe characters are
    # rewritten into their escaped form.
    protected = _LITERAL_UNSAFE.sub(
        lambda m: _LITERAL_PROTECT.get(m.group(), m.group()), text)
    return ast.literal_eval(u'u"' + protected + u'"')
def uriref(string):
    """
    Extract the URI from an N-Triples URI reference.

    :param string: Candidate URI reference, including the angle brackets.
    :return: The URI with its escape sequences decoded, or `None` if
             `string` does not match ``uriref.pattern``.
    """
    found = uriref.pattern.match(string)
    return ntriples_unescape(found.group(1)) if found else None


# Regex source for a URI reference: angle brackets around text containing
# a colon. The single group captures the text between the brackets.
uri_pattern = r'<([^:]+:.+)>'
# Anchored version, stored as an attribute of the function that uses it.
uriref.pattern = re.compile(uri_pattern + '$')
def nodeid(string):
    """
    Extract the node id from an N-Triples blank node reference.

    :param string: Candidate blank node reference (``_:`` followed by an id).
    :return: The id without the ``_:`` prefix, or `None` if `string`
             does not match ``nodeid.pattern``.
    """
    found = nodeid.pattern.match(string)
    return found.group(1) if found is not None else None


# ``_:`` followed by a letter and then any number of letters or digits;
# the group captures the id itself.
nodeid.pattern = re.compile(r'_:([A-Za-z][A-Za-z0-9]*)$')
def literal(string):
    """
    Parse a literal written in NTriples syntax.

    Note the order of the result: the datatype comes *before* the
    language tag.

    :param string: Candidate literal: a double-quoted label optionally
                   followed by either ``@lang`` or ``^^<datatype-uri>``.
    :return: A ``(label, datatype, language)`` tuple, or `None` if
             `string` is not a valid literal. The label and the datatype
             URI have their escape sequences decoded. ``datatype`` and
             ``language`` are `None` when absent.
    """
    match = literal.pattern.match(string)
    if not match:
        return None
    # Regex groups are numbered in textual order: label, language, datatype.
    label, lang, dtype = match.groups()
    return ntriples_unescape(label), ntriples_unescape(dtype), lang


# Quoted label: any run of characters other than quote/backslash,
# interleaved with backslash escapes. The group excludes the quotes.
litvalue = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
# Optional suffix: a language tag or a datatype URI (never both).
# Language subtags may use upper- and lowercase letters, as permitted by
# the LANGTAG production of the RDF 1.1 N-Triples grammar (e.g. "en-US").
litinfo = r'(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*)|\^\^' + uri_pattern + r')?'
literal.pattern = re.compile(litvalue + litinfo + '$')
def to_bytes(text):
    """
    Encode Unicode text as UTF-8 bytes.

    Values that are not Unicode strings are returned unchanged.

    :param text: Text to be converted.
    :type text: str|bytes|unicode
    :rtype: bytes
    """
    return text.encode('utf-8') if isinstance(text, unicode) else text
# The native string type differs between major Python versions, so the
# implementation is selected once, at import time.
if sys.version_info[0] <= 2:
    def to_native_string(text):
        """
        Converts text to the native (byte) string type of Python 2.

        UTF-8 encoding is used if the text needs to be encoded.

        :param text: Text to be converted (either Unicode or bytes).
        :type text: str|bytes|unicode
        :rtype: str
        """
        if isnewbytes(text):
            return bytes.__str__(text)
        # Anything that is not already a native string must be Unicode...
        return text if isinstance(text, native_str) else text.encode('utf-8')
else:
    def to_native_string(text):
        """
        Converts text to the native (Unicode) string type of Python 3.

        UTF-8 encoding is used if the text needs to be decoded.

        :param text: Text to be converted (either Unicode or bytes).
        :type text: str|bytes|unicode
        :rtype: str
        """
        return str(text, 'utf-8') if isinstance(text, bytes) else text