123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613 |
- """
- pygments.lexers.html
- ~~~~~~~~~~~~~~~~~~~~
- Lexers for HTML, XML and related markup.
- :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
- :license: BSD, see LICENSE for details.
- """
- import re
- from pygments.lexer import RegexLexer, ExtendedRegexLexer, include, bygroups, \
- default, using
- from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
- Punctuation
- from pygments.util import looks_like_xml, html_doctype_matches
- from pygments.lexers.javascript import JavascriptLexer
- from pygments.lexers.jvm import ScalaLexer
- from pygments.lexers.css import CssLexer, _indentation, _starts_block
- from pygments.lexers.ruby import RubyLexer
- __all__ = ['HtmlLexer', 'DtdLexer', 'XmlLexer', 'XsltLexer', 'HamlLexer',
- 'ScamlLexer', 'PugLexer']
- class HtmlLexer(RegexLexer):
- """
- For HTML 4 and XHTML 1 markup. Nested JavaScript and CSS is highlighted
- by the appropriate lexer.
- """
- name = 'HTML'
- aliases = ['html']
- filenames = ['*.html', '*.htm', '*.xhtml', '*.xslt']
- mimetypes = ['text/html', 'application/xhtml+xml']
- flags = re.IGNORECASE | re.DOTALL
- tokens = {
- 'root': [
- ('[^<&]+', Text),
- (r'&\S*?;', Name.Entity),
- (r'\<\!\[CDATA\[.*?\]\]\>', Comment.Preproc),
- ('<!--', Comment, 'comment'),
- (r'<\?.*?\?>', Comment.Preproc),
- ('<![^>]*>', Comment.Preproc),
- (r'(<)(\s*)(script)(\s*)',
- bygroups(Punctuation, Text, Name.Tag, Text),
- ('script-content', 'tag')),
- (r'(<)(\s*)(style)(\s*)',
- bygroups(Punctuation, Text, Name.Tag, Text),
- ('style-content', 'tag')),
- # note: this allows tag names not used in HTML like <x:with-dash>,
- # this is to support yet-unknown template engines and the like
- (r'(<)(\s*)([\w:.-]+)',
- bygroups(Punctuation, Text, Name.Tag), 'tag'),
- (r'(<)(\s*)(/)(\s*)([\w:.-]+)(\s*)(>)',
- bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
- Punctuation)),
- ],
- 'comment': [
- ('[^-]+', Comment),
- ('-->', Comment, '#pop'),
- ('-', Comment),
- ],
- 'tag': [
- (r'\s+', Text),
- (r'([\w:-]+\s*)(=)(\s*)', bygroups(Name.Attribute, Operator, Text),
- 'attr'),
- (r'[\w:-]+', Name.Attribute),
- (r'(/?)(\s*)(>)', bygroups(Punctuation, Text, Punctuation), '#pop'),
- ],
- 'script-content': [
- (r'(<)(\s*)(/)(\s*)(script)(\s*)(>)',
- bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
- Punctuation), '#pop'),
- (r'.+?(?=<\s*/\s*script\s*>)', using(JavascriptLexer)),
- # fallback cases for when there is no closing script tag
- # first look for newline and then go back into root state
- # if that fails just read the rest of the file
- # this is similar to the error handling logic in lexer.py
- (r'.+?\n', using(JavascriptLexer), '#pop'),
- (r'.+', using(JavascriptLexer), '#pop'),
- ],
- 'style-content': [
- (r'(<)(\s*)(/)(\s*)(style)(\s*)(>)',
- bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text,
- Punctuation),'#pop'),
- (r'.+?(?=<\s*/\s*style\s*>)', using(CssLexer)),
- # fallback cases for when there is no closing style tag
- # first look for newline and then go back into root state
- # if that fails just read the rest of the file
- # this is similar to the error handling logic in lexer.py
- (r'.+?\n', using(CssLexer), '#pop'),
- (r'.+', using(CssLexer), '#pop'),
- ],
- 'attr': [
- ('".*?"', String, '#pop'),
- ("'.*?'", String, '#pop'),
- (r'[^\s>]+', String, '#pop'),
- ],
- }
- def analyse_text(text):
- if html_doctype_matches(text):
- return 0.5
- class DtdLexer(RegexLexer):
- """
- A lexer for DTDs (Document Type Definitions).
- .. versionadded:: 1.5
- """
- flags = re.MULTILINE | re.DOTALL
- name = 'DTD'
- aliases = ['dtd']
- filenames = ['*.dtd']
- mimetypes = ['application/xml-dtd']
- tokens = {
- 'root': [
- include('common'),
- (r'(<!ELEMENT)(\s+)(\S+)',
- bygroups(Keyword, Text, Name.Tag), 'element'),
- (r'(<!ATTLIST)(\s+)(\S+)',
- bygroups(Keyword, Text, Name.Tag), 'attlist'),
- (r'(<!ENTITY)(\s+)(\S+)',
- bygroups(Keyword, Text, Name.Entity), 'entity'),
- (r'(<!NOTATION)(\s+)(\S+)',
- bygroups(Keyword, Text, Name.Tag), 'notation'),
- (r'(<!\[)([^\[\s]+)(\s*)(\[)', # conditional sections
- bygroups(Keyword, Name.Entity, Text, Keyword)),
- (r'(<!DOCTYPE)(\s+)([^>\s]+)',
- bygroups(Keyword, Text, Name.Tag)),
- (r'PUBLIC|SYSTEM', Keyword.Constant),
- (r'[\[\]>]', Keyword),
- ],
- 'common': [
- (r'\s+', Text),
- (r'(%|&)[^;]*;', Name.Entity),
- ('<!--', Comment, 'comment'),
- (r'[(|)*,?+]', Operator),
- (r'"[^"]*"', String.Double),
- (r'\'[^\']*\'', String.Single),
- ],
- 'comment': [
- ('[^-]+', Comment),
- ('-->', Comment, '#pop'),
- ('-', Comment),
- ],
- 'element': [
- include('common'),
- (r'EMPTY|ANY|#PCDATA', Keyword.Constant),
- (r'[^>\s|()?+*,]+', Name.Tag),
- (r'>', Keyword, '#pop'),
- ],
- 'attlist': [
- include('common'),
- (r'CDATA|IDREFS|IDREF|ID|NMTOKENS|NMTOKEN|ENTITIES|ENTITY|NOTATION',
- Keyword.Constant),
- (r'#REQUIRED|#IMPLIED|#FIXED', Keyword.Constant),
- (r'xml:space|xml:lang', Keyword.Reserved),
- (r'[^>\s|()?+*,]+', Name.Attribute),
- (r'>', Keyword, '#pop'),
- ],
- 'entity': [
- include('common'),
- (r'SYSTEM|PUBLIC|NDATA', Keyword.Constant),
- (r'[^>\s|()?+*,]+', Name.Entity),
- (r'>', Keyword, '#pop'),
- ],
- 'notation': [
- include('common'),
- (r'SYSTEM|PUBLIC', Keyword.Constant),
- (r'[^>\s|()?+*,]+', Name.Attribute),
- (r'>', Keyword, '#pop'),
- ],
- }
- def analyse_text(text):
- if not looks_like_xml(text) and \
- ('<!ELEMENT' in text or '<!ATTLIST' in text or '<!ENTITY' in text):
- return 0.8
- class XmlLexer(RegexLexer):
- """
- Generic lexer for XML (eXtensible Markup Language).
- """
- flags = re.MULTILINE | re.DOTALL | re.UNICODE
- name = 'XML'
- aliases = ['xml']
- filenames = ['*.xml', '*.xsl', '*.rss', '*.xslt', '*.xsd',
- '*.wsdl', '*.wsf']
- mimetypes = ['text/xml', 'application/xml', 'image/svg+xml',
- 'application/rss+xml', 'application/atom+xml']
- tokens = {
- 'root': [
- ('[^<&]+', Text),
- (r'&\S*?;', Name.Entity),
- (r'\<\!\[CDATA\[.*?\]\]\>', Comment.Preproc),
- ('<!--', Comment, 'comment'),
- (r'<\?.*?\?>', Comment.Preproc),
- ('<![^>]*>', Comment.Preproc),
- (r'<\s*[\w:.-]+', Name.Tag, 'tag'),
- (r'<\s*/\s*[\w:.-]+\s*>', Name.Tag),
- ],
- 'comment': [
- ('[^-]+', Comment),
- ('-->', Comment, '#pop'),
- ('-', Comment),
- ],
- 'tag': [
- (r'\s+', Text),
- (r'[\w.:-]+\s*=', Name.Attribute, 'attr'),
- (r'/?\s*>', Name.Tag, '#pop'),
- ],
- 'attr': [
- (r'\s+', Text),
- ('".*?"', String, '#pop'),
- ("'.*?'", String, '#pop'),
- (r'[^\s>]+', String, '#pop'),
- ],
- }
- def analyse_text(text):
- if looks_like_xml(text):
- return 0.45 # less than HTML
- class XsltLexer(XmlLexer):
- """
- A lexer for XSLT.
- .. versionadded:: 0.10
- """
- name = 'XSLT'
- aliases = ['xslt']
- filenames = ['*.xsl', '*.xslt', '*.xpl'] # xpl is XProc
- mimetypes = ['application/xsl+xml', 'application/xslt+xml']
- EXTRA_KEYWORDS = {
- 'apply-imports', 'apply-templates', 'attribute',
- 'attribute-set', 'call-template', 'choose', 'comment',
- 'copy', 'copy-of', 'decimal-format', 'element', 'fallback',
- 'for-each', 'if', 'import', 'include', 'key', 'message',
- 'namespace-alias', 'number', 'otherwise', 'output', 'param',
- 'preserve-space', 'processing-instruction', 'sort',
- 'strip-space', 'stylesheet', 'template', 'text', 'transform',
- 'value-of', 'variable', 'when', 'with-param'
- }
- def get_tokens_unprocessed(self, text):
- for index, token, value in XmlLexer.get_tokens_unprocessed(self, text):
- m = re.match('</?xsl:([^>]*)/?>?', value)
- if token is Name.Tag and m and m.group(1) in self.EXTRA_KEYWORDS:
- yield index, Keyword, value
- else:
- yield index, token, value
- def analyse_text(text):
- if looks_like_xml(text) and '<xsl' in text:
- return 0.8
- class HamlLexer(ExtendedRegexLexer):
- """
- For Haml markup.
- .. versionadded:: 1.3
- """
- name = 'Haml'
- aliases = ['haml']
- filenames = ['*.haml']
- mimetypes = ['text/x-haml']
- flags = re.IGNORECASE
- # Haml can include " |\n" anywhere,
- # which is ignored and used to wrap long lines.
- # To accomodate this, use this custom faux dot instead.
- _dot = r'(?: \|\n(?=.* \|)|.)'
- # In certain places, a comma at the end of the line
- # allows line wrapping as well.
- _comma_dot = r'(?:,\s*\n|' + _dot + ')'
- tokens = {
- 'root': [
- (r'[ \t]*\n', Text),
- (r'[ \t]*', _indentation),
- ],
- 'css': [
- (r'\.[\w:-]+', Name.Class, 'tag'),
- (r'\#[\w:-]+', Name.Function, 'tag'),
- ],
- 'eval-or-plain': [
- (r'[&!]?==', Punctuation, 'plain'),
- (r'([&!]?[=~])(' + _comma_dot + r'*\n)',
- bygroups(Punctuation, using(RubyLexer)),
- 'root'),
- default('plain'),
- ],
- 'content': [
- include('css'),
- (r'%[\w:-]+', Name.Tag, 'tag'),
- (r'!!!' + _dot + r'*\n', Name.Namespace, '#pop'),
- (r'(/)(\[' + _dot + r'*?\])(' + _dot + r'*\n)',
- bygroups(Comment, Comment.Special, Comment),
- '#pop'),
- (r'/' + _dot + r'*\n', _starts_block(Comment, 'html-comment-block'),
- '#pop'),
- (r'-#' + _dot + r'*\n', _starts_block(Comment.Preproc,
- 'haml-comment-block'), '#pop'),
- (r'(-)(' + _comma_dot + r'*\n)',
- bygroups(Punctuation, using(RubyLexer)),
- '#pop'),
- (r':' + _dot + r'*\n', _starts_block(Name.Decorator, 'filter-block'),
- '#pop'),
- include('eval-or-plain'),
- ],
- 'tag': [
- include('css'),
- (r'\{(,\n|' + _dot + r')*?\}', using(RubyLexer)),
- (r'\[' + _dot + r'*?\]', using(RubyLexer)),
- (r'\(', Text, 'html-attributes'),
- (r'/[ \t]*\n', Punctuation, '#pop:2'),
- (r'[<>]{1,2}(?=[ \t=])', Punctuation),
- include('eval-or-plain'),
- ],
- 'plain': [
- (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text),
- (r'(#\{)(' + _dot + r'*?)(\})',
- bygroups(String.Interpol, using(RubyLexer), String.Interpol)),
- (r'\n', Text, 'root'),
- ],
- 'html-attributes': [
- (r'\s+', Text),
- (r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'),
- (r'[\w:-]+', Name.Attribute),
- (r'\)', Text, '#pop'),
- ],
- 'html-attribute-value': [
- (r'[ \t]+', Text),
- (r'\w+', Name.Variable, '#pop'),
- (r'@\w+', Name.Variable.Instance, '#pop'),
- (r'\$\w+', Name.Variable.Global, '#pop'),
- (r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'),
- (r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'),
- ],
- 'html-comment-block': [
- (_dot + '+', Comment),
- (r'\n', Text, 'root'),
- ],
- 'haml-comment-block': [
- (_dot + '+', Comment.Preproc),
- (r'\n', Text, 'root'),
- ],
- 'filter-block': [
- (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator),
- (r'(#\{)(' + _dot + r'*?)(\})',
- bygroups(String.Interpol, using(RubyLexer), String.Interpol)),
- (r'\n', Text, 'root'),
- ],
- }
- class ScamlLexer(ExtendedRegexLexer):
- """
- For `Scaml markup <http://scalate.fusesource.org/>`_. Scaml is Haml for Scala.
- .. versionadded:: 1.4
- """
- name = 'Scaml'
- aliases = ['scaml']
- filenames = ['*.scaml']
- mimetypes = ['text/x-scaml']
- flags = re.IGNORECASE
- # Scaml does not yet support the " |\n" notation to
- # wrap long lines. Once it does, use the custom faux
- # dot instead.
- # _dot = r'(?: \|\n(?=.* \|)|.)'
- _dot = r'.'
- tokens = {
- 'root': [
- (r'[ \t]*\n', Text),
- (r'[ \t]*', _indentation),
- ],
- 'css': [
- (r'\.[\w:-]+', Name.Class, 'tag'),
- (r'\#[\w:-]+', Name.Function, 'tag'),
- ],
- 'eval-or-plain': [
- (r'[&!]?==', Punctuation, 'plain'),
- (r'([&!]?[=~])(' + _dot + r'*\n)',
- bygroups(Punctuation, using(ScalaLexer)),
- 'root'),
- default('plain'),
- ],
- 'content': [
- include('css'),
- (r'%[\w:-]+', Name.Tag, 'tag'),
- (r'!!!' + _dot + r'*\n', Name.Namespace, '#pop'),
- (r'(/)(\[' + _dot + r'*?\])(' + _dot + r'*\n)',
- bygroups(Comment, Comment.Special, Comment),
- '#pop'),
- (r'/' + _dot + r'*\n', _starts_block(Comment, 'html-comment-block'),
- '#pop'),
- (r'-#' + _dot + r'*\n', _starts_block(Comment.Preproc,
- 'scaml-comment-block'), '#pop'),
- (r'(-@\s*)(import)?(' + _dot + r'*\n)',
- bygroups(Punctuation, Keyword, using(ScalaLexer)),
- '#pop'),
- (r'(-)(' + _dot + r'*\n)',
- bygroups(Punctuation, using(ScalaLexer)),
- '#pop'),
- (r':' + _dot + r'*\n', _starts_block(Name.Decorator, 'filter-block'),
- '#pop'),
- include('eval-or-plain'),
- ],
- 'tag': [
- include('css'),
- (r'\{(,\n|' + _dot + r')*?\}', using(ScalaLexer)),
- (r'\[' + _dot + r'*?\]', using(ScalaLexer)),
- (r'\(', Text, 'html-attributes'),
- (r'/[ \t]*\n', Punctuation, '#pop:2'),
- (r'[<>]{1,2}(?=[ \t=])', Punctuation),
- include('eval-or-plain'),
- ],
- 'plain': [
- (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text),
- (r'(#\{)(' + _dot + r'*?)(\})',
- bygroups(String.Interpol, using(ScalaLexer), String.Interpol)),
- (r'\n', Text, 'root'),
- ],
- 'html-attributes': [
- (r'\s+', Text),
- (r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'),
- (r'[\w:-]+', Name.Attribute),
- (r'\)', Text, '#pop'),
- ],
- 'html-attribute-value': [
- (r'[ \t]+', Text),
- (r'\w+', Name.Variable, '#pop'),
- (r'@\w+', Name.Variable.Instance, '#pop'),
- (r'\$\w+', Name.Variable.Global, '#pop'),
- (r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'),
- (r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'),
- ],
- 'html-comment-block': [
- (_dot + '+', Comment),
- (r'\n', Text, 'root'),
- ],
- 'scaml-comment-block': [
- (_dot + '+', Comment.Preproc),
- (r'\n', Text, 'root'),
- ],
- 'filter-block': [
- (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator),
- (r'(#\{)(' + _dot + r'*?)(\})',
- bygroups(String.Interpol, using(ScalaLexer), String.Interpol)),
- (r'\n', Text, 'root'),
- ],
- }
- class PugLexer(ExtendedRegexLexer):
- """
- For Pug markup.
- Pug is a variant of Scaml, see:
- http://scalate.fusesource.org/documentation/scaml-reference.html
- .. versionadded:: 1.4
- """
- name = 'Pug'
- aliases = ['pug', 'jade']
- filenames = ['*.pug', '*.jade']
- mimetypes = ['text/x-pug', 'text/x-jade']
- flags = re.IGNORECASE
- _dot = r'.'
- tokens = {
- 'root': [
- (r'[ \t]*\n', Text),
- (r'[ \t]*', _indentation),
- ],
- 'css': [
- (r'\.[\w:-]+', Name.Class, 'tag'),
- (r'\#[\w:-]+', Name.Function, 'tag'),
- ],
- 'eval-or-plain': [
- (r'[&!]?==', Punctuation, 'plain'),
- (r'([&!]?[=~])(' + _dot + r'*\n)',
- bygroups(Punctuation, using(ScalaLexer)), 'root'),
- default('plain'),
- ],
- 'content': [
- include('css'),
- (r'!!!' + _dot + r'*\n', Name.Namespace, '#pop'),
- (r'(/)(\[' + _dot + r'*?\])(' + _dot + r'*\n)',
- bygroups(Comment, Comment.Special, Comment),
- '#pop'),
- (r'/' + _dot + r'*\n', _starts_block(Comment, 'html-comment-block'),
- '#pop'),
- (r'-#' + _dot + r'*\n', _starts_block(Comment.Preproc,
- 'scaml-comment-block'), '#pop'),
- (r'(-@\s*)(import)?(' + _dot + r'*\n)',
- bygroups(Punctuation, Keyword, using(ScalaLexer)),
- '#pop'),
- (r'(-)(' + _dot + r'*\n)',
- bygroups(Punctuation, using(ScalaLexer)),
- '#pop'),
- (r':' + _dot + r'*\n', _starts_block(Name.Decorator, 'filter-block'),
- '#pop'),
- (r'[\w:-]+', Name.Tag, 'tag'),
- (r'\|', Text, 'eval-or-plain'),
- ],
- 'tag': [
- include('css'),
- (r'\{(,\n|' + _dot + r')*?\}', using(ScalaLexer)),
- (r'\[' + _dot + r'*?\]', using(ScalaLexer)),
- (r'\(', Text, 'html-attributes'),
- (r'/[ \t]*\n', Punctuation, '#pop:2'),
- (r'[<>]{1,2}(?=[ \t=])', Punctuation),
- include('eval-or-plain'),
- ],
- 'plain': [
- (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text),
- (r'(#\{)(' + _dot + r'*?)(\})',
- bygroups(String.Interpol, using(ScalaLexer), String.Interpol)),
- (r'\n', Text, 'root'),
- ],
- 'html-attributes': [
- (r'\s+', Text),
- (r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'),
- (r'[\w:-]+', Name.Attribute),
- (r'\)', Text, '#pop'),
- ],
- 'html-attribute-value': [
- (r'[ \t]+', Text),
- (r'\w+', Name.Variable, '#pop'),
- (r'@\w+', Name.Variable.Instance, '#pop'),
- (r'\$\w+', Name.Variable.Global, '#pop'),
- (r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'),
- (r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'),
- ],
- 'html-comment-block': [
- (_dot + '+', Comment),
- (r'\n', Text, 'root'),
- ],
- 'scaml-comment-block': [
- (_dot + '+', Comment.Preproc),
- (r'\n', Text, 'root'),
- ],
- 'filter-block': [
- (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator),
- (r'(#\{)(' + _dot + r'*?)(\})',
- bygroups(String.Interpol, using(ScalaLexer), String.Interpol)),
- (r'\n', Text, 'root'),
- ],
- }
- JadeLexer = PugLexer # compat
|