data.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697
  1. """
  2. pygments.lexers.data
  3. ~~~~~~~~~~~~~~~~~~~~
4. Lexers for data file formats.
  5. :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
  6. :license: BSD, see LICENSE for details.
  7. """
  8. import re
  9. from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, LexerContext, \
  10. include, bygroups, inherit
  11. from pygments.token import Text, Comment, Keyword, Name, String, Number, \
  12. Punctuation, Literal, Error
  13. __all__ = ['YamlLexer', 'JsonLexer', 'JsonBareObjectLexer', 'JsonLdLexer']
  14. class YamlLexerContext(LexerContext):
  15. """Indentation context for the YAML lexer."""
  16. def __init__(self, *args, **kwds):
  17. super().__init__(*args, **kwds)
  18. self.indent_stack = []
  19. self.indent = -1
  20. self.next_indent = 0
  21. self.block_scalar_indent = None
class YamlLexer(ExtendedRegexLexer):
    """
    Lexer for `YAML <http://yaml.org/>`_, a human-friendly data serialization
    language.

    .. versionadded:: 0.11
    """

    name = 'YAML'
    aliases = ['yaml']
    filenames = ['*.yaml', '*.yml']
    mimetypes = ['text/x-yaml']

    # NOTE: the functions below are callback *factories* evaluated at class
    # body execution time (not instance methods), so their first argument is
    # the token class to emit rather than ``self``.  Each returned callback
    # has the ExtendedRegexLexer signature ``(lexer, match, context)`` and
    # must advance ``context.pos`` itself.

    def something(token_class):
        """Do not produce empty tokens."""
        def callback(lexer, match, context):
            text = match.group()
            if not text:
                return
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback

    def reset_indent(token_class):
        """Reset the indentation levels."""
        def callback(lexer, match, context):
            text = match.group()
            # Directives and document markers always start a new context.
            context.indent_stack = []
            context.indent = -1
            context.next_indent = 0
            context.block_scalar_indent = None
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback

    def save_indent(token_class, start=False):
        """Save a possible indentation level."""
        def callback(lexer, match, context):
            text = match.group()
            extra = ''
            if start:
                # At line start the whole run of spaces is the candidate
                # indentation; pop levels we have outdented past.
                context.next_indent = len(text)
                if context.next_indent < context.indent:
                    while context.next_indent < context.indent:
                        context.indent = context.indent_stack.pop()
                    if context.next_indent > context.indent:
                        # Spaces beyond any known level are an error.
                        extra = text[context.indent:]
                        text = text[:context.indent]
            else:
                context.next_indent += len(text)
            if text:
                yield match.start(), token_class, text
            if extra:
                yield match.start()+len(text), token_class.Error, extra
            context.pos = match.end()
        return callback

    def set_indent(token_class, implicit=False):
        """Set the previously saved indentation level."""
        def callback(lexer, match, context):
            text = match.group()
            if context.indent < context.next_indent:
                context.indent_stack.append(context.indent)
                context.indent = context.next_indent
            if not implicit:
                # Explicit indicators ('-', '?', ':') count toward the
                # indentation of what follows them.
                context.next_indent += len(text)
            yield match.start(), token_class, text
            context.pos = match.end()
        return callback

    def set_block_scalar_indent(token_class):
        """Set an explicit indentation level for a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            context.block_scalar_indent = None
            if not text:
                return
            increment = match.group(1)
            if increment:
                # The indentation indicator gives an increment relative to
                # the parent node's indentation.
                current_indent = max(context.indent, 0)
                increment = int(increment)
                context.block_scalar_indent = current_indent + increment
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    def parse_block_scalar_empty_line(indent_token_class, content_token_class):
        """Process an empty line in a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            if (context.block_scalar_indent is None or
                    len(text) <= context.block_scalar_indent):
                if text:
                    yield match.start(), indent_token_class, text
            else:
                # Spaces past the scalar's indent are scalar content.
                indentation = text[:context.block_scalar_indent]
                content = text[context.block_scalar_indent:]
                yield match.start(), indent_token_class, indentation
                yield (match.start()+context.block_scalar_indent,
                       content_token_class, content)
            context.pos = match.end()
        return callback

    def parse_block_scalar_indent(token_class):
        """Process indentation spaces in a block scalar."""
        def callback(lexer, match, context):
            text = match.group()
            if context.block_scalar_indent is None:
                # First non-empty line determines the scalar's indentation;
                # a line at or below the parent indent ends the scalar.
                if len(text) <= max(context.indent, 0):
                    context.stack.pop()
                    context.stack.pop()
                    return
                context.block_scalar_indent = len(text)
            else:
                if len(text) < context.block_scalar_indent:
                    context.stack.pop()
                    context.stack.pop()
                    return
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    def parse_plain_scalar_indent(token_class):
        """Process indentation spaces in a plain scalar."""
        def callback(lexer, match, context):
            text = match.group()
            # A line indented at or below the node's level ends the scalar.
            if len(text) <= context.indent:
                context.stack.pop()
                context.stack.pop()
                return
            if text:
                yield match.start(), token_class, text
                context.pos = match.end()
        return callback

    tokens = {
        # the root rules
        'root': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text),
            # line breaks
            (r'\n+', Text),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # the '%YAML' directive
            (r'^%YAML(?=[ ]|$)', reset_indent(Name.Tag), 'yaml-directive'),
            # the %TAG directive
            (r'^%TAG(?=[ ]|$)', reset_indent(Name.Tag), 'tag-directive'),
            # document start and document end indicators
            (r'^(?:---|\.\.\.)(?=[ ]|$)', reset_indent(Name.Namespace),
             'block-line'),
            # indentation spaces
            (r'[ ]*(?!\s|$)', save_indent(Text, start=True),
             ('block-line', 'indentation')),
        ],

        # trailing whitespaces after directives or a block scalar indicator
        'ignored-line': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # line break
            (r'\n', Text, '#pop:2'),
        ],

        # the %YAML directive
        'yaml-directive': [
            # the version number
            (r'([ ]+)([0-9]+\.[0-9]+)',
             bygroups(Text, Number), 'ignored-line'),
        ],

        # the %TAG directive
        'tag-directive': [
            # a tag handle and the corresponding prefix
            (r'([ ]+)(!|![\w-]*!)'
             r'([ ]+)(!|!?[\w;/?:@&=+$,.!~*\'()\[\]%-]+)',
             bygroups(Text, Keyword.Type, Text, Keyword.Type),
             'ignored-line'),
        ],

        # block scalar indicators and indentation spaces
        'indentation': [
            # trailing whitespaces are ignored
            (r'[ ]*$', something(Text), '#pop:2'),
            # whitespaces preceding block collection indicators
            (r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Text)),
            # block collection indicators
            (r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
            # the beginning a block line
            (r'[ ]*', save_indent(Text), '#pop'),
        ],

        # an indented line in the block context
        'block-line': [
            # the line end
            (r'[ ]*(?=#|$)', something(Text), '#pop'),
            # whitespaces separating tokens
            (r'[ ]+', Text),
            # key with colon
            (r'''([^#,:?\[\]{}"'\n]+)(:)(?=[ ]|$)''',
             bygroups(Name.Tag, set_indent(Punctuation, implicit=True))),
            # tags, anchors and aliases,
            include('descriptors'),
            # block collections and scalars
            include('block-nodes'),
            # flow collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^\s?:,\[\]{}#&*!|>\'"%@`-]|[?:-]\S)',
             something(Name.Variable),
             'plain-scalar-in-block-context'),
        ],

        # tags, anchors, aliases
        'descriptors': [
            # a full-form tag
            (r'!<[\w#;/?:@&=+$,.!~*\'()\[\]%-]+>', Keyword.Type),
            # a tag in the form '!', '!suffix' or '!handle!suffix'
            (r'!(?:[\w-]+!)?'
             r'[\w#;/?:@&=+$,.!~*\'()\[\]%-]*', Keyword.Type),
            # an anchor
            (r'&[\w-]+', Name.Label),
            # an alias
            (r'\*[\w-]+', Name.Variable),
        ],

        # block collections and scalars
        'block-nodes': [
            # implicit key
            (r':(?=[ ]|$)', set_indent(Punctuation.Indicator, implicit=True)),
            # literal and folded scalars
            (r'[|>]', Punctuation.Indicator,
             ('block-scalar-content', 'block-scalar-header')),
        ],

        # flow collections and quoted scalars
        'flow-nodes': [
            # a flow sequence
            (r'\[', Punctuation.Indicator, 'flow-sequence'),
            # a flow mapping
            (r'\{', Punctuation.Indicator, 'flow-mapping'),
            # a single-quoted scalar
            (r'\'', String, 'single-quoted-scalar'),
            # a double-quoted scalar
            (r'\"', String, 'double-quoted-scalar'),
        ],

        # the content of a flow collection
        'flow-collection': [
            # whitespaces
            (r'[ ]+', Text),
            # line breaks
            (r'\n+', Text),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # simple indicators
            (r'[?:,]', Punctuation.Indicator),
            # tags, anchors and aliases
            include('descriptors'),
            # nested collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^\s?:,\[\]{}#&*!|>\'"%@`])',
             something(Name.Variable),
             'plain-scalar-in-flow-context'),
        ],

        # a flow sequence indicated by '[' and ']'
        'flow-sequence': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\]', Punctuation.Indicator, '#pop'),
        ],

        # a flow mapping indicated by '{' and '}'
        'flow-mapping': [
            # key with colon
            (r'''([^,:?\[\]{}"'\n]+)(:)(?=[ ]|$)''',
             bygroups(Name.Tag, Punctuation)),
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\}', Punctuation.Indicator, '#pop'),
        ],

        # block scalar lines
        'block-scalar-content': [
            # line break
            (r'\n', Text),
            # empty line
            (r'^[ ]+$',
             parse_block_scalar_empty_line(Text, Name.Constant)),
            # indentation spaces (we may leave the state here)
            (r'^[ ]*', parse_block_scalar_indent(Text)),
            # line content
            (r'[\S\t ]+', Name.Constant),
        ],

        # the content of a literal or folded scalar
        'block-scalar-header': [
            # indentation indicator followed by chomping flag
            (r'([1-9])?[+-]?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
            # chomping flag followed by indentation indicator
            (r'[+-]?([1-9])?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
        ],

        # ignored and regular whitespaces in quoted scalars
        'quoted-scalar-whitespaces': [
            # leading and trailing whitespaces are ignored
            (r'^[ ]+', Text),
            (r'[ ]+$', Text),
            # line breaks are ignored
            (r'\n+', Text),
            # other whitespaces are a part of the value
            (r'[ ]+', Name.Variable),
        ],

        # single-quoted scalars
        'single-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of the quote character
            (r'\'\'', String.Escape),
            # regular non-whitespace characters
            (r'[^\s\']+', String),
            # the closing quote
            (r'\'', String, '#pop'),
        ],

        # double-quoted scalars
        'double-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of special characters
            (r'\\[0abt\tn\nvfre "\\N_LP]', String),
            # escape codes
            (r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
             String.Escape),
            # regular non-whitespace characters
            (r'[^\s"\\]+', String),
            # the closing quote
            (r'"', String, '#pop'),
        ],

        # the beginning of a new line while scanning a plain scalar
        'plain-scalar-in-block-context-new-line': [
            # empty lines
            (r'^[ ]+$', Text),
            # line breaks
            (r'\n+', Text),
            # document start and document end indicators
            (r'^(?=---|\.\.\.)', something(Name.Namespace), '#pop:3'),
            # indentation spaces (we may leave the block line state here)
            (r'^[ ]*', parse_plain_scalar_indent(Text), '#pop'),
        ],

        # a plain scalar in the block context
        'plain-scalar-in-block-context': [
            # the scalar ends with the ':' indicator
            (r'[ ]*(?=:[ ]|:$)', something(Text), '#pop'),
            # the scalar ends with whitespaces followed by a comment
            (r'[ ]+(?=#)', Text, '#pop'),
            # trailing whitespaces are ignored
            (r'[ ]+$', Text),
            # line breaks are ignored
            (r'\n+', Text, 'plain-scalar-in-block-context-new-line'),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'(?::(?!\s)|[^\s:])+', Literal.Scalar.Plain),
        ],

        # a plain scalar is the flow context
        'plain-scalar-in-flow-context': [
            # the scalar ends with an indicator character
            (r'[ ]*(?=[,:?\[\]{}])', something(Text), '#pop'),
            # the scalar ends with a comment
            (r'[ ]+(?=#)', Text, '#pop'),
            # leading and trailing whitespaces are ignored
            (r'^[ ]+', Text),
            (r'[ ]+$', Text),
            # line breaks are ignored
            (r'\n+', Text),
            # other whitespaces are a part of the value
            (r'[ ]+', Name.Variable),
            # regular non-whitespace characters
            (r'[^\s,:?\[\]{}]+', Name.Variable),
        ],

    }

    def get_tokens_unprocessed(self, text=None, context=None):
        # Supply a YamlLexerContext so the indentation-tracking callbacks
        # above have their state fields available.
        if context is None:
            context = YamlLexerContext(text, 0)
        return super().get_tokens_unprocessed(text, context)
class JsonLexer(Lexer):
    """
    For JSON data structures.

    .. versionadded:: 1.5
    """

    name = 'JSON'
    aliases = ['json', 'json-object']
    filenames = ['*.json', 'Pipfile.lock']
    mimetypes = ['application/json', 'application/json-object']

    # No validation of integers, floats, or constants is done.
    # As long as the characters are members of the following
    # sets, the token will be considered valid. For example,
    #
    #     "--1--" is parsed as an integer
    #     "1...eee" is parsed as a float
    #     "trustful" is parsed as a constant
    #
    integers = set('-0123456789')
    floats = set('.eE+')
    constants = set('truefalsenull')  # true|false|null
    hexadecimals = set('0123456789abcdefABCDEF')
    punctuations = set('{}[],')
    whitespaces = {'\u0020', '\u000a', '\u000d', '\u0009'}

    def get_tokens_unprocessed(self, text):
        """Parse JSON data."""

        # One-character-at-a-time state machine.  Exactly one of the
        # ``in_*`` flags (or none) is active at a time; ``start`` marks
        # where the current token began.
        in_string = False
        in_escape = False
        in_unicode_escape = 0
        in_whitespace = False
        in_constant = False
        in_number = False
        in_float = False
        in_punctuation = False

        start = 0

        # The queue is used to store data that may need to be tokenized
        # differently based on what follows. In particular, JSON object
        # keys are tokenized differently than string values, but cannot
        # be distinguished until punctuation is encountered outside the
        # string.
        #
        # A ":" character after the string indicates that the string is
        # an object key; any other character indicates the string is a
        # regular string value.
        #
        # The queue holds tuples that contain the following data:
        #
        #     (start_index, token_type, text)
        #
        # By default the token type of text in double quotes is
        # String.Double. The token type will be replaced if a colon
        # is encountered after the string closes.
        #
        queue = []

        for stop, character in enumerate(text):
            if in_string:
                if in_unicode_escape:
                    # Inside a \uXXXX escape: count down four hex digits.
                    if character in self.hexadecimals:
                        in_unicode_escape -= 1
                        if not in_unicode_escape:
                            in_escape = False
                    else:
                        in_unicode_escape = 0
                        in_escape = False

                elif in_escape:
                    if character == 'u':
                        in_unicode_escape = 4
                    else:
                        in_escape = False

                elif character == '\\':
                    in_escape = True

                elif character == '"':
                    # String closed; queue it until we know whether a ':'
                    # follows (object key) or not (plain string value).
                    queue.append((start, String.Double, text[start:stop + 1]))
                    in_string = False
                    in_escape = False
                    in_unicode_escape = 0

                # Characters inside a string never start a new token.
                continue

            elif in_whitespace:
                if character in self.whitespaces:
                    continue

                if queue:
                    queue.append((start, Text, text[start:stop]))
                else:
                    yield start, Text, text[start:stop]
                in_whitespace = False
                # Fall through so the new character can be evaluated.

            elif in_constant:
                if character in self.constants:
                    continue

                yield start, Keyword.Constant, text[start:stop]
                in_constant = False
                # Fall through so the new character can be evaluated.

            elif in_number:
                if character in self.integers:
                    continue
                elif character in self.floats:
                    in_float = True
                    continue

                if in_float:
                    yield start, Number.Float, text[start:stop]
                else:
                    yield start, Number.Integer, text[start:stop]
                in_number = False
                in_float = False
                # Fall through so the new character can be evaluated.

            elif in_punctuation:
                if character in self.punctuations:
                    continue

                yield start, Punctuation, text[start:stop]
                in_punctuation = False
                # Fall through so the new character can be evaluated.

            # The previous token (if any) has been flushed; this
            # character begins a new one.
            start = stop

            if character == '"':
                in_string = True

            elif character in self.whitespaces:
                in_whitespace = True

            elif character in {'f', 'n', 't'}:  # The first letters of true|false|null
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_constant = True

            elif character in self.integers:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_number = True

            elif character == ':':
                # Yield from the queue. Replace string token types.
                for _start, _token, _text in queue:
                    if _token is Text:
                        yield _start, _token, _text
                    elif _token is String.Double:
                        # The queued string precedes ':' — it is a key.
                        yield _start, Name.Tag, _text
                    else:
                        yield _start, Error, _text
                queue.clear()

                in_punctuation = True

            elif character in self.punctuations:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                in_punctuation = True

            else:
                # Exhaust the queue. Accept the existing token types.
                yield from queue
                queue.clear()

                yield start, Error, character

        # Yield any remaining text.
        yield from queue
        if in_string:
            # An unclosed string at EOF is an error.
            yield start, Error, text[start:]
        elif in_float:
            yield start, Number.Float, text[start:]
        elif in_number:
            yield start, Number.Integer, text[start:]
        elif in_constant:
            yield start, Keyword.Constant, text[start:]
        elif in_whitespace:
            yield start, Text, text[start:]
        elif in_punctuation:
            yield start, Punctuation, text[start:]
class JsonBareObjectLexer(JsonLexer):
    """
    For JSON data structures (with missing object curly braces).

    .. versionadded:: 2.2

    .. deprecated:: 2.8.0

       Behaves the same as `JsonLexer` now.
    """

    # Kept only for backward compatibility with the old alias; the empty
    # aliases/filenames/mimetypes prevent it from being auto-selected.
    name = 'JSONBareObject'
    aliases = []
    filenames = []
    mimetypes = []
  565. class JsonLdLexer(JsonLexer):
  566. """
  567. For `JSON-LD <https://json-ld.org/>`_ linked data.
  568. .. versionadded:: 2.0
  569. """
  570. name = 'JSON-LD'
  571. aliases = ['jsonld', 'json-ld']
  572. filenames = ['*.jsonld']
  573. mimetypes = ['application/ld+json']
  574. json_ld_keywords = {
  575. '"@%s"' % keyword
  576. for keyword in (
  577. 'base',
  578. 'container',
  579. 'context',
  580. 'direction',
  581. 'graph',
  582. 'id',
  583. 'import',
  584. 'included',
  585. 'index',
  586. 'json',
  587. 'language',
  588. 'list',
  589. 'nest',
  590. 'none',
  591. 'prefix',
  592. 'propagate',
  593. 'protected',
  594. 'reverse',
  595. 'set',
  596. 'type',
  597. 'value',
  598. 'version',
  599. 'vocab',
  600. )
  601. }
  602. def get_tokens_unprocessed(self, text):
  603. for start, token, value in super().get_tokens_unprocessed(text):
  604. if token is Name.Tag and value in self.json_ld_keywords:
  605. yield start, Name.Decorator, value
  606. else:
  607. yield start, token, value