"""
pygments.lexers.data
~~~~~~~~~~~~~~~~~~~~
Lexers for data file format.
:copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import re
from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, LexerContext, \
include, bygroups, inherit
from pygments.token import Text, Comment, Keyword, Name, String, Number, \
Punctuation, Literal, Error
__all__ = ['YamlLexer', 'JsonLexer', 'JsonBareObjectLexer', 'JsonLdLexer']
class YamlLexerContext(LexerContext):
"""Indentation context for the YAML lexer."""
def __init__(self, *args, **kwds):
super().__init__(*args, **kwds)
self.indent_stack = []
self.indent = -1
self.next_indent = 0
self.block_scalar_indent = None
class YamlLexer(ExtendedRegexLexer):
"""
Lexer for `YAML `_, a human-friendly data serialization
language.
.. versionadded:: 0.11
"""
name = 'YAML'
aliases = ['yaml']
filenames = ['*.yaml', '*.yml']
mimetypes = ['text/x-yaml']
def something(token_class):
"""Do not produce empty tokens."""
def callback(lexer, match, context):
text = match.group()
if not text:
return
yield match.start(), token_class, text
context.pos = match.end()
return callback
def reset_indent(token_class):
"""Reset the indentation levels."""
def callback(lexer, match, context):
text = match.group()
context.indent_stack = []
context.indent = -1
context.next_indent = 0
context.block_scalar_indent = None
yield match.start(), token_class, text
context.pos = match.end()
return callback
def save_indent(token_class, start=False):
"""Save a possible indentation level."""
def callback(lexer, match, context):
text = match.group()
extra = ''
if start:
context.next_indent = len(text)
if context.next_indent < context.indent:
while context.next_indent < context.indent:
context.indent = context.indent_stack.pop()
if context.next_indent > context.indent:
extra = text[context.indent:]
text = text[:context.indent]
else:
context.next_indent += len(text)
if text:
yield match.start(), token_class, text
if extra:
yield match.start()+len(text), token_class.Error, extra
context.pos = match.end()
return callback
def set_indent(token_class, implicit=False):
"""Set the previously saved indentation level."""
def callback(lexer, match, context):
text = match.group()
if context.indent < context.next_indent:
context.indent_stack.append(context.indent)
context.indent = context.next_indent
if not implicit:
context.next_indent += len(text)
yield match.start(), token_class, text
context.pos = match.end()
return callback
def set_block_scalar_indent(token_class):
"""Set an explicit indentation level for a block scalar."""
def callback(lexer, match, context):
text = match.group()
context.block_scalar_indent = None
if not text:
return
increment = match.group(1)
if increment:
current_indent = max(context.indent, 0)
increment = int(increment)
context.block_scalar_indent = current_indent + increment
if text:
yield match.start(), token_class, text
context.pos = match.end()
return callback
def parse_block_scalar_empty_line(indent_token_class, content_token_class):
"""Process an empty line in a block scalar."""
def callback(lexer, match, context):
text = match.group()
if (context.block_scalar_indent is None or
len(text) <= context.block_scalar_indent):
if text:
yield match.start(), indent_token_class, text
else:
indentation = text[:context.block_scalar_indent]
content = text[context.block_scalar_indent:]
yield match.start(), indent_token_class, indentation
yield (match.start()+context.block_scalar_indent,
content_token_class, content)
context.pos = match.end()
return callback
def parse_block_scalar_indent(token_class):
"""Process indentation spaces in a block scalar."""
def callback(lexer, match, context):
text = match.group()
if context.block_scalar_indent is None:
if len(text) <= max(context.indent, 0):
context.stack.pop()
context.stack.pop()
return
context.block_scalar_indent = len(text)
else:
if len(text) < context.block_scalar_indent:
context.stack.pop()
context.stack.pop()
return
if text:
yield match.start(), token_class, text
context.pos = match.end()
return callback
def parse_plain_scalar_indent(token_class):
"""Process indentation spaces in a plain scalar."""
def callback(lexer, match, context):
text = match.group()
if len(text) <= context.indent:
context.stack.pop()
context.stack.pop()
return
if text:
yield match.start(), token_class, text
context.pos = match.end()
return callback
tokens = {
# the root rules
'root': [
# ignored whitespaces
(r'[ ]+(?=#|$)', Text),
# line breaks
(r'\n+', Text),
# a comment
(r'#[^\n]*', Comment.Single),
# the '%YAML' directive
(r'^%YAML(?=[ ]|$)', reset_indent(Name.Tag), 'yaml-directive'),
# the %TAG directive
(r'^%TAG(?=[ ]|$)', reset_indent(Name.Tag), 'tag-directive'),
# document start and document end indicators
(r'^(?:---|\.\.\.)(?=[ ]|$)', reset_indent(Name.Namespace),
'block-line'),
# indentation spaces
(r'[ ]*(?!\s|$)', save_indent(Text, start=True),
('block-line', 'indentation')),
],
# trailing whitespaces after directives or a block scalar indicator
'ignored-line': [
# ignored whitespaces
(r'[ ]+(?=#|$)', Text),
# a comment
(r'#[^\n]*', Comment.Single),
# line break
(r'\n', Text, '#pop:2'),
],
# the %YAML directive
'yaml-directive': [
# the version number
(r'([ ]+)([0-9]+\.[0-9]+)',
bygroups(Text, Number), 'ignored-line'),
],
# the %TAG directive
'tag-directive': [
# a tag handle and the corresponding prefix
(r'([ ]+)(!|![\w-]*!)'
r'([ ]+)(!|!?[\w;/?:@&=+$,.!~*\'()\[\]%-]+)',
bygroups(Text, Keyword.Type, Text, Keyword.Type),
'ignored-line'),
],
# block scalar indicators and indentation spaces
'indentation': [
# trailing whitespaces are ignored
(r'[ ]*$', something(Text), '#pop:2'),
# whitespaces preceding block collection indicators
(r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Text)),
# block collection indicators
(r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
# the beginning a block line
(r'[ ]*', save_indent(Text), '#pop'),
],
# an indented line in the block context
'block-line': [
# the line end
(r'[ ]*(?=#|$)', something(Text), '#pop'),
# whitespaces separating tokens
(r'[ ]+', Text),
# key with colon
(r'''([^#,:?\[\]{}"'\n]+)(:)(?=[ ]|$)''',
bygroups(Name.Tag, set_indent(Punctuation, implicit=True))),
# tags, anchors and aliases,
include('descriptors'),
# block collections and scalars
include('block-nodes'),
# flow collections and quoted scalars
include('flow-nodes'),
# a plain scalar
(r'(?=[^\s?:,\[\]{}#&*!|>\'"%@`-]|[?:-]\S)',
something(Name.Variable),
'plain-scalar-in-block-context'),
],
# tags, anchors, aliases
'descriptors': [
# a full-form tag
(r'!<[\w#;/?:@&=+$,.!~*\'()\[\]%-]+>', Keyword.Type),
# a tag in the form '!', '!suffix' or '!handle!suffix'
(r'!(?:[\w-]+!)?'
r'[\w#;/?:@&=+$,.!~*\'()\[\]%-]*', Keyword.Type),
# an anchor
(r'&[\w-]+', Name.Label),
# an alias
(r'\*[\w-]+', Name.Variable),
],
# block collections and scalars
'block-nodes': [
# implicit key
(r':(?=[ ]|$)', set_indent(Punctuation.Indicator, implicit=True)),
# literal and folded scalars
(r'[|>]', Punctuation.Indicator,
('block-scalar-content', 'block-scalar-header')),
],
# flow collections and quoted scalars
'flow-nodes': [
# a flow sequence
(r'\[', Punctuation.Indicator, 'flow-sequence'),
# a flow mapping
(r'\{', Punctuation.Indicator, 'flow-mapping'),
# a single-quoted scalar
(r'\'', String, 'single-quoted-scalar'),
# a double-quoted scalar
(r'\"', String, 'double-quoted-scalar'),
],
# the content of a flow collection
'flow-collection': [
# whitespaces
(r'[ ]+', Text),
# line breaks
(r'\n+', Text),
# a comment
(r'#[^\n]*', Comment.Single),
# simple indicators
(r'[?:,]', Punctuation.Indicator),
# tags, anchors and aliases
include('descriptors'),
# nested collections and quoted scalars
include('flow-nodes'),
# a plain scalar
(r'(?=[^\s?:,\[\]{}#&*!|>\'"%@`])',
something(Name.Variable),
'plain-scalar-in-flow-context'),
],
# a flow sequence indicated by '[' and ']'
'flow-sequence': [
# include flow collection rules
include('flow-collection'),
# the closing indicator
(r'\]', Punctuation.Indicator, '#pop'),
],
# a flow mapping indicated by '{' and '}'
'flow-mapping': [
# key with colon
(r'''([^,:?\[\]{}"'\n]+)(:)(?=[ ]|$)''',
bygroups(Name.Tag, Punctuation)),
# include flow collection rules
include('flow-collection'),
# the closing indicator
(r'\}', Punctuation.Indicator, '#pop'),
],
# block scalar lines
'block-scalar-content': [
# line break
(r'\n', Text),
# empty line
(r'^[ ]+$',
parse_block_scalar_empty_line(Text, Name.Constant)),
# indentation spaces (we may leave the state here)
(r'^[ ]*', parse_block_scalar_indent(Text)),
# line content
(r'[\S\t ]+', Name.Constant),
],
# the content of a literal or folded scalar
'block-scalar-header': [
# indentation indicator followed by chomping flag
(r'([1-9])?[+-]?(?=[ ]|$)',
set_block_scalar_indent(Punctuation.Indicator),
'ignored-line'),
# chomping flag followed by indentation indicator
(r'[+-]?([1-9])?(?=[ ]|$)',
set_block_scalar_indent(Punctuation.Indicator),
'ignored-line'),
],
# ignored and regular whitespaces in quoted scalars
'quoted-scalar-whitespaces': [
# leading and trailing whitespaces are ignored
(r'^[ ]+', Text),
(r'[ ]+$', Text),
# line breaks are ignored
(r'\n+', Text),
# other whitespaces are a part of the value
(r'[ ]+', Name.Variable),
],
# single-quoted scalars
'single-quoted-scalar': [
# include whitespace and line break rules
include('quoted-scalar-whitespaces'),
# escaping of the quote character
(r'\'\'', String.Escape),
# regular non-whitespace characters
(r'[^\s\']+', String),
# the closing quote
(r'\'', String, '#pop'),
],
# double-quoted scalars
'double-quoted-scalar': [
# include whitespace and line break rules
include('quoted-scalar-whitespaces'),
# escaping of special characters
(r'\\[0abt\tn\nvfre "\\N_LP]', String),
# escape codes
(r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
String.Escape),
# regular non-whitespace characters
(r'[^\s"\\]+', String),
# the closing quote
(r'"', String, '#pop'),
],
# the beginning of a new line while scanning a plain scalar
'plain-scalar-in-block-context-new-line': [
# empty lines
(r'^[ ]+$', Text),
# line breaks
(r'\n+', Text),
# document start and document end indicators
(r'^(?=---|\.\.\.)', something(Name.Namespace), '#pop:3'),
# indentation spaces (we may leave the block line state here)
(r'^[ ]*', parse_plain_scalar_indent(Text), '#pop'),
],
# a plain scalar in the block context
'plain-scalar-in-block-context': [
# the scalar ends with the ':' indicator
(r'[ ]*(?=:[ ]|:$)', something(Text), '#pop'),
# the scalar ends with whitespaces followed by a comment
(r'[ ]+(?=#)', Text, '#pop'),
# trailing whitespaces are ignored
(r'[ ]+$', Text),
# line breaks are ignored
(r'\n+', Text, 'plain-scalar-in-block-context-new-line'),
# other whitespaces are a part of the value
(r'[ ]+', Literal.Scalar.Plain),
# regular non-whitespace characters
(r'(?::(?!\s)|[^\s:])+', Literal.Scalar.Plain),
],
# a plain scalar is the flow context
'plain-scalar-in-flow-context': [
# the scalar ends with an indicator character
(r'[ ]*(?=[,:?\[\]{}])', something(Text), '#pop'),
# the scalar ends with a comment
(r'[ ]+(?=#)', Text, '#pop'),
# leading and trailing whitespaces are ignored
(r'^[ ]+', Text),
(r'[ ]+$', Text),
# line breaks are ignored
(r'\n+', Text),
# other whitespaces are a part of the value
(r'[ ]+', Name.Variable),
# regular non-whitespace characters
(r'[^\s,:?\[\]{}]+', Name.Variable),
],
}
def get_tokens_unprocessed(self, text=None, context=None):
if context is None:
context = YamlLexerContext(text, 0)
return super().get_tokens_unprocessed(text, context)
class JsonLexer(Lexer):
"""
For JSON data structures.
.. versionadded:: 1.5
"""
name = 'JSON'
aliases = ['json', 'json-object']
filenames = ['*.json', 'Pipfile.lock']
mimetypes = ['application/json', 'application/json-object']
# No validation of integers, floats, or constants is done.
# As long as the characters are members of the following
# sets, the token will be considered valid. For example,
#
# "--1--" is parsed as an integer
# "1...eee" is parsed as a float
# "trustful" is parsed as a constant
#
integers = set('-0123456789')
floats = set('.eE+')
constants = set('truefalsenull') # true|false|null
hexadecimals = set('0123456789abcdefABCDEF')
punctuations = set('{}[],')
whitespaces = {'\u0020', '\u000a', '\u000d', '\u0009'}
def get_tokens_unprocessed(self, text):
"""Parse JSON data."""
in_string = False
in_escape = False
in_unicode_escape = 0
in_whitespace = False
in_constant = False
in_number = False
in_float = False
in_punctuation = False
start = 0
# The queue is used to store data that may need to be tokenized
# differently based on what follows. In particular, JSON object
# keys are tokenized differently than string values, but cannot
# be distinguished until punctuation is encountered outside the
# string.
#
# A ":" character after the string indicates that the string is
# an object key; any other character indicates the string is a
# regular string value.
#
# The queue holds tuples that contain the following data:
#
# (start_index, token_type, text)
#
# By default the token type of text in double quotes is
# String.Double. The token type will be replaced if a colon
# is encountered after the string closes.
#
queue = []
for stop, character in enumerate(text):
if in_string:
if in_unicode_escape:
if character in self.hexadecimals:
in_unicode_escape -= 1
if not in_unicode_escape:
in_escape = False
else:
in_unicode_escape = 0
in_escape = False
elif in_escape:
if character == 'u':
in_unicode_escape = 4
else:
in_escape = False
elif character == '\\':
in_escape = True
elif character == '"':
queue.append((start, String.Double, text[start:stop + 1]))
in_string = False
in_escape = False
in_unicode_escape = 0
continue
elif in_whitespace:
if character in self.whitespaces:
continue
if queue:
queue.append((start, Text, text[start:stop]))
else:
yield start, Text, text[start:stop]
in_whitespace = False
# Fall through so the new character can be evaluated.
elif in_constant:
if character in self.constants:
continue
yield start, Keyword.Constant, text[start:stop]
in_constant = False
# Fall through so the new character can be evaluated.
elif in_number:
if character in self.integers:
continue
elif character in self.floats:
in_float = True
continue
if in_float:
yield start, Number.Float, text[start:stop]
else:
yield start, Number.Integer, text[start:stop]
in_number = False
in_float = False
# Fall through so the new character can be evaluated.
elif in_punctuation:
if character in self.punctuations:
continue
yield start, Punctuation, text[start:stop]
in_punctuation = False
# Fall through so the new character can be evaluated.
start = stop
if character == '"':
in_string = True
elif character in self.whitespaces:
in_whitespace = True
elif character in {'f', 'n', 't'}: # The first letters of true|false|null
# Exhaust the queue. Accept the existing token types.
yield from queue
queue.clear()
in_constant = True
elif character in self.integers:
# Exhaust the queue. Accept the existing token types.
yield from queue
queue.clear()
in_number = True
elif character == ':':
# Yield from the queue. Replace string token types.
for _start, _token, _text in queue:
if _token is Text:
yield _start, _token, _text
elif _token is String.Double:
yield _start, Name.Tag, _text
else:
yield _start, Error, _text
queue.clear()
in_punctuation = True
elif character in self.punctuations:
# Exhaust the queue. Accept the existing token types.
yield from queue
queue.clear()
in_punctuation = True
else:
# Exhaust the queue. Accept the existing token types.
yield from queue
queue.clear()
yield start, Error, character
# Yield any remaining text.
yield from queue
if in_string:
yield start, Error, text[start:]
elif in_float:
yield start, Number.Float, text[start:]
elif in_number:
yield start, Number.Integer, text[start:]
elif in_constant:
yield start, Keyword.Constant, text[start:]
elif in_whitespace:
yield start, Text, text[start:]
elif in_punctuation:
yield start, Punctuation, text[start:]
class JsonBareObjectLexer(JsonLexer):
"""
For JSON data structures (with missing object curly braces).
.. versionadded:: 2.2
.. deprecated:: 2.8.0
Behaves the same as `JsonLexer` now.
"""
name = 'JSONBareObject'
aliases = []
filenames = []
mimetypes = []
class JsonLdLexer(JsonLexer):
"""
For `JSON-LD `_ linked data.
.. versionadded:: 2.0
"""
name = 'JSON-LD'
aliases = ['jsonld', 'json-ld']
filenames = ['*.jsonld']
mimetypes = ['application/ld+json']
json_ld_keywords = {
'"@%s"' % keyword
for keyword in (
'base',
'container',
'context',
'direction',
'graph',
'id',
'import',
'included',
'index',
'json',
'language',
'list',
'nest',
'none',
'prefix',
'propagate',
'protected',
'reverse',
'set',
'type',
'value',
'version',
'vocab',
)
}
def get_tokens_unprocessed(self, text):
for start, token, value in super().get_tokens_unprocessed(text):
if token is Name.Tag and value in self.json_ld_keywords:
yield start, Name.Decorator, value
else:
yield start, token, value