12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029 |
- <!DOCTYPE html>
- <html lang="en">
- <head>
- <meta charset="utf-8" />
- <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />
- <meta property="og:title" content="Unicode HOWTO" />
- <meta property="og:type" content="website" />
- <meta property="og:url" content="https://docs.python.org/3/howto/unicode.html" />
- <meta property="og:site_name" content="Python documentation" />
- <meta property="og:description" content="Release: 1.12. This HOWTO discusses Python’s support for the Unicode specification for representing textual data, and explains various problems that people commonly encounter when trying to work w..." />
- <meta property="og:image" content="https://docs.python.org/3/_static/og-image.png" />
- <meta property="og:image:alt" content="Python documentation" />
- <meta name="description" content="Release: 1.12. This HOWTO discusses Python’s support for the Unicode specification for representing textual data, and explains various problems that people commonly encounter when trying to work w..." />
- <meta property="og:image:width" content="200" />
- <meta property="og:image:height" content="200" />
- <meta name="theme-color" content="#3776ab" />
- <title>Unicode HOWTO — Python 3.12.0 documentation</title><meta name="viewport" content="width=device-width, initial-scale=1.0">
-
- <link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
- <link rel="stylesheet" type="text/css" href="../_static/pydoctheme.css?digest=b37c26da2f7529d09fe70b41c4b2133fe4931a90" />
- <link id="pygments_dark_css" media="(prefers-color-scheme: dark)" rel="stylesheet" type="text/css" href="../_static/pygments_dark.css" />
-
- <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
- <script src="../_static/jquery.js"></script>
- <script src="../_static/underscore.js"></script>
- <script src="../_static/doctools.js"></script>
-
- <script src="../_static/sidebar.js"></script>
-
- <link rel="search" type="application/opensearchdescription+xml"
- title="Search within Python 3.12.0 documentation"
- href="../_static/opensearch.xml"/>
- <link rel="author" title="About these documents" href="../about.html" />
- <link rel="index" title="Index" href="../genindex.html" />
- <link rel="search" title="Search" href="../search.html" />
- <link rel="copyright" title="Copyright" href="../copyright.html" />
- <link rel="next" title="HOWTO Fetch Internet Resources Using The urllib Package" href="urllib2.html" />
- <link rel="prev" title="Sorting HOW TO" href="sorting.html" />
- <link rel="canonical" href="https://docs.python.org/3/howto/unicode.html" />
-
-
-
-
- <style>
- @media only screen {
- table.full-width-table {
- width: 100%;
- }
- }
- </style>
- <link rel="stylesheet" href="../_static/pydoctheme_dark.css" media="(prefers-color-scheme: dark)" id="pydoctheme_dark_css">
- <link rel="shortcut icon" type="image/png" href="../_static/py.svg" />
- <script type="text/javascript" src="../_static/copybutton.js"></script>
- <script type="text/javascript" src="../_static/menu.js"></script>
- <script type="text/javascript" src="../_static/themetoggle.js"></script>
- </head>
- <body>
- <div class="mobile-nav">
- <input type="checkbox" id="menuToggler" class="toggler__input" aria-controls="navigation"
- aria-pressed="false" aria-expanded="false" role="button" aria-label="Menu" />
- <nav class="nav-content" role="navigation">
- <label for="menuToggler" class="toggler__label">
- <span></span>
- </label>
- <span class="nav-items-wrapper">
- <a href="https://www.python.org/" class="nav-logo">
- <img src="../_static/py.svg" alt="Logo"/>
- </a>
- <span class="version_switcher_placeholder"></span>
- <form role="search" class="search" action="../search.html" method="get">
- <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" class="search-icon">
- <path fill-rule="nonzero" fill="currentColor" d="M15.5 14h-.79l-.28-.27a6.5 6.5 0 001.48-5.34c-.47-2.78-2.79-5-5.59-5.34a6.505 6.505 0 00-7.27 7.27c.34 2.8 2.56 5.12 5.34 5.59a6.5 6.5 0 005.34-1.48l.27.28v.79l4.25 4.25c.41.41 1.08.41 1.49 0 .41-.41.41-1.08 0-1.49L15.5 14zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"></path>
- </svg>
- <input placeholder="Quick search" aria-label="Quick search" type="search" name="q" />
- <input type="submit" value="Go"/>
- </form>
- </span>
- </nav>
- <div class="menu-wrapper">
- <nav class="menu" role="navigation" aria-label="main navigation">
- <div class="language_switcher_placeholder"></div>
-
- <label class="theme-selector-label">
- Theme
- <select class="theme-selector" oninput="activateTheme(this.value)">
- <option value="auto" selected>Auto</option>
- <option value="light">Light</option>
- <option value="dark">Dark</option>
- </select>
- </label>
- <div>
- <h3><a href="../contents.html">Table of Contents</a></h3>
- <ul>
- <li><a class="reference internal" href="#">Unicode HOWTO</a><ul>
- <li><a class="reference internal" href="#introduction-to-unicode">Introduction to Unicode</a><ul>
- <li><a class="reference internal" href="#definitions">Definitions</a></li>
- <li><a class="reference internal" href="#encodings">Encodings</a></li>
- <li><a class="reference internal" href="#references">References</a></li>
- </ul>
- </li>
- <li><a class="reference internal" href="#python-s-unicode-support">Python’s Unicode Support</a><ul>
- <li><a class="reference internal" href="#the-string-type">The String Type</a></li>
- <li><a class="reference internal" href="#converting-to-bytes">Converting to Bytes</a></li>
- <li><a class="reference internal" href="#unicode-literals-in-python-source-code">Unicode Literals in Python Source Code</a></li>
- <li><a class="reference internal" href="#unicode-properties">Unicode Properties</a></li>
- <li><a class="reference internal" href="#comparing-strings">Comparing Strings</a></li>
- <li><a class="reference internal" href="#unicode-regular-expressions">Unicode Regular Expressions</a></li>
- <li><a class="reference internal" href="#id2">References</a></li>
- </ul>
- </li>
- <li><a class="reference internal" href="#reading-and-writing-unicode-data">Reading and Writing Unicode Data</a><ul>
- <li><a class="reference internal" href="#unicode-filenames">Unicode filenames</a></li>
- <li><a class="reference internal" href="#tips-for-writing-unicode-aware-programs">Tips for Writing Unicode-aware Programs</a><ul>
- <li><a class="reference internal" href="#converting-between-file-encodings">Converting Between File Encodings</a></li>
- <li><a class="reference internal" href="#files-in-an-unknown-encoding">Files in an Unknown Encoding</a></li>
- </ul>
- </li>
- <li><a class="reference internal" href="#id3">References</a></li>
- </ul>
- </li>
- <li><a class="reference internal" href="#acknowledgements">Acknowledgements</a></li>
- </ul>
- </li>
- </ul>
- </div>
- <div>
- <h4>Previous topic</h4>
- <p class="topless"><a href="sorting.html"
- title="previous chapter">Sorting HOW TO</a></p>
- </div>
- <div>
- <h4>Next topic</h4>
- <p class="topless"><a href="urllib2.html"
- title="next chapter">HOWTO Fetch Internet Resources Using The urllib Package</a></p>
- </div>
- <div role="note" aria-label="source link">
- <h3>This Page</h3>
- <ul class="this-page-menu">
- <li><a href="../bugs.html">Report a Bug</a></li>
- <li>
- <a href="https://github.com/python/cpython/blob/main/Doc/howto/unicode.rst"
- rel="nofollow">Show Source
- </a>
- </li>
- </ul>
- </div>
- </nav>
- </div>
- </div>
-
- <div class="related" role="navigation" aria-label="related navigation">
- <h3>Navigation</h3>
- <ul>
- <li class="right" style="margin-right: 10px">
- <a href="../genindex.html" title="General Index"
- accesskey="I">index</a></li>
- <li class="right" >
- <a href="../py-modindex.html" title="Python Module Index"
- >modules</a> |</li>
- <li class="right" >
- <a href="urllib2.html" title="HOWTO Fetch Internet Resources Using The urllib Package"
- accesskey="N">next</a> |</li>
- <li class="right" >
- <a href="sorting.html" title="Sorting HOW TO"
- accesskey="P">previous</a> |</li>
- <li><img src="../_static/py.svg" alt="python logo" style="vertical-align: middle; margin-top: -1px"/></li>
- <li><a href="https://www.python.org/">Python</a> »</li>
- <li class="switchers">
- <div class="language_switcher_placeholder"></div>
- <div class="version_switcher_placeholder"></div>
- </li>
- <li>
-
- </li>
- <li id="cpython-language-and-version">
- <a href="../index.html">3.12.0 Documentation</a> »
- </li>
- <li class="nav-item nav-item-1"><a href="index.html" accesskey="U">Python HOWTOs</a> »</li>
- <li class="nav-item nav-item-this"><a href="">Unicode HOWTO</a></li>
- <li class="right">
-
- <div class="inline-search" role="search">
- <form class="inline-search" action="../search.html" method="get">
- <input placeholder="Quick search" aria-label="Quick search" type="search" name="q" />
- <input type="submit" value="Go" />
- </form>
- </div>
- |
- </li>
- <li class="right">
- <label class="theme-selector-label">
- Theme
- <select class="theme-selector" oninput="activateTheme(this.value)">
- <option value="auto" selected>Auto</option>
- <option value="light">Light</option>
- <option value="dark">Dark</option>
- </select>
- </label> |</li>
-
- </ul>
- </div>
- <div class="document">
- <div class="documentwrapper">
- <div class="bodywrapper">
- <div class="body" role="main">
-
- <section id="unicode-howto">
- <span id="id1"></span><h1>Unicode HOWTO<a class="headerlink" href="#unicode-howto" title="Permalink to this headline">¶</a></h1>
- <dl class="field-list simple">
- <dt class="field-odd">Release</dt>
- <dd class="field-odd"><p>1.12</p>
- </dd>
- </dl>
- <p>This HOWTO discusses Python’s support for the Unicode specification
- for representing textual data, and explains various problems that
- people commonly encounter when trying to work with Unicode.</p>
- <section id="introduction-to-unicode">
- <h2>Introduction to Unicode<a class="headerlink" href="#introduction-to-unicode" title="Permalink to this headline">¶</a></h2>
- <section id="definitions">
- <h3>Definitions<a class="headerlink" href="#definitions" title="Permalink to this headline">¶</a></h3>
- <p>Today’s programs need to be able to handle a wide variety of
- characters. Applications are often internationalized to display
- messages and output in a variety of user-selectable languages; the
- same program might need to output an error message in English, French,
- Japanese, Hebrew, or Russian. Web content can be written in any of
- these languages and can also include a variety of emoji symbols.
- Python’s string type uses the Unicode Standard for representing
- characters, which lets Python programs work with all these different
- possible characters.</p>
- <p>Unicode (<a class="reference external" href="https://www.unicode.org/">https://www.unicode.org/</a>) is a specification that aims to
- list every character used by human languages and give each character
- its own unique code. The Unicode specifications are continually
- revised and updated to add new languages and symbols.</p>
- <p>A <strong>character</strong> is the smallest possible component of a text. ‘A’, ‘B’, ‘C’,
- etc., are all different characters. So are ‘È’ and ‘Í’. Characters vary
- depending on the language or context you’re talking
- about. For example, there’s a character for “Roman Numeral One”, ‘Ⅰ’, that’s
- separate from the uppercase letter ‘I’. They’ll usually look the same,
- but these are two different characters that have different meanings.</p>
- <p>The Unicode standard describes how characters are represented by
- <strong>code points</strong>. A code point value is an integer in the range 0 to
- 0x10FFFF (about 1.1 million values, the
- <a class="reference external" href="https://www.unicode.org/versions/latest/#Summary">actual number assigned</a>
- is less than that). In the standard and in this document, a code point is written
- using the notation <code class="docutils literal notranslate"><span class="pre">U+265E</span></code> to mean the character with value
- <code class="docutils literal notranslate"><span class="pre">0x265e</span></code> (9,822 in decimal).</p>
- <p>The Unicode standard contains a lot of tables listing characters and
- their corresponding code points:</p>
- <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>0061 'a'; LATIN SMALL LETTER A
- 0062 'b'; LATIN SMALL LETTER B
- 0063 'c'; LATIN SMALL LETTER C
- ...
- 007B '{'; LEFT CURLY BRACKET
- ...
- 2167 'Ⅷ'; ROMAN NUMERAL EIGHT
- 2168 'Ⅸ'; ROMAN NUMERAL NINE
- ...
- 265E '♞'; BLACK CHESS KNIGHT
- 265F '♟'; BLACK CHESS PAWN
- ...
- 1F600 '😀'; GRINNING FACE
- 1F609 '😉'; WINKING FACE
- ...
- </pre></div>
- </div>
- <p>Strictly, these definitions imply that it’s meaningless to say ‘this is
- character <code class="docutils literal notranslate"><span class="pre">U+265E</span></code>’. <code class="docutils literal notranslate"><span class="pre">U+265E</span></code> is a code point, which represents some particular
- character; in this case, it represents the character ‘BLACK CHESS KNIGHT’,
- ‘♞’. In
- informal contexts, this distinction between code points and characters will
- sometimes be forgotten.</p>
- <p>A character is represented on a screen or on paper by a set of graphical
- elements that’s called a <strong>glyph</strong>. The glyph for an uppercase A, for example,
- is two diagonal strokes and a horizontal stroke, though the exact details will
- depend on the font being used. Most Python code doesn’t need to worry about
- glyphs; figuring out the correct glyph to display is generally the job of a GUI
- toolkit or a terminal’s font renderer.</p>
- </section>
- <section id="encodings">
- <h3>Encodings<a class="headerlink" href="#encodings" title="Permalink to this headline">¶</a></h3>
- <p>To summarize the previous section: a Unicode string is a sequence of
- code points, which are numbers from 0 through <code class="docutils literal notranslate"><span class="pre">0x10FFFF</span></code> (1,114,111
- decimal). This sequence of code points needs to be represented in
- memory as a set of <strong>code units</strong>, and <strong>code units</strong> are then mapped
- to 8-bit bytes. The rules for translating a Unicode string into a
- sequence of bytes are called a <strong>character encoding</strong>, or just
- an <strong>encoding</strong>.</p>
- <p>The first encoding you might think of is using 32-bit integers as the
- code unit, and then using the CPU’s representation of 32-bit integers.
- In this representation, the string “Python” might look like this:</p>
- <div class="highlight-none notranslate"><div class="highlight"><pre><span></span> P y t h o n
- 0x50 00 00 00 79 00 00 00 74 00 00 00 68 00 00 00 6f 00 00 00 6e 00 00 00
- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
- </pre></div>
- </div>
- <p>This representation is straightforward but using it presents a number of
- problems.</p>
- <ol class="arabic simple">
- <li><p>It’s not portable; different processors order the bytes differently.</p></li>
- <li><p>It’s very wasteful of space. In most texts, the majority of the code points
- are less than 128, or less than 256, so a lot of space is occupied by <code class="docutils literal notranslate"><span class="pre">0x00</span></code>
- bytes. The above string takes 24 bytes compared to the 6 bytes needed for an
- ASCII representation. Increased RAM usage doesn’t matter too much (desktop
- computers have gigabytes of RAM, and strings aren’t usually that large), but
- expanding our usage of disk and network bandwidth by a factor of 4 is
- intolerable.</p></li>
- <li><p>It’s not compatible with existing C functions such as <code class="docutils literal notranslate"><span class="pre">strlen()</span></code>, so a new
- family of wide string functions would need to be used.</p></li>
- </ol>
- <p>Therefore this encoding isn’t used very much, and people instead choose other
- encodings that are more efficient and convenient, such as UTF-8.</p>
- <p>UTF-8 is one of the most commonly used encodings, and Python often
- defaults to using it. UTF stands for “Unicode Transformation Format”,
- and the ‘8’ means that 8-bit values are used in the encoding. (There
- are also UTF-16 and UTF-32 encodings, but they are less frequently
- used than UTF-8.) UTF-8 uses the following rules:</p>
- <ol class="arabic simple">
- <li><p>If the code point is &lt; 128, it’s represented by the corresponding byte value.</p></li>
- <li><p>If the code point is &gt;= 128, it’s turned into a sequence of two, three, or
- four bytes, where each byte of the sequence is between 128 and 255.</p></li>
- </ol>
- <p>UTF-8 has several convenient properties:</p>
- <ol class="arabic simple">
- <li><p>It can handle any Unicode code point.</p></li>
- <li><p>A Unicode string is turned into a sequence of bytes that contains embedded
- zero bytes only where they represent the null character (U+0000). This means
- that UTF-8 strings can be processed by C functions such as <code class="docutils literal notranslate"><span class="pre">strcpy()</span></code> and sent
- through protocols that can’t handle zero bytes for anything other than
- end-of-string markers.</p></li>
- <li><p>A string of ASCII text is also valid UTF-8 text.</p></li>
- <li><p>UTF-8 is fairly compact; the majority of commonly used characters can be
- represented with one or two bytes.</p></li>
- <li><p>If bytes are corrupted or lost, it’s possible to determine the start of the
- next UTF-8-encoded code point and resynchronize. It’s also unlikely that
- random 8-bit data will look like valid UTF-8.</p></li>
- <li><p>UTF-8 is a byte oriented encoding. The encoding specifies that each
- character is represented by a specific sequence of one or more bytes. This
- avoids the byte-ordering issues that can occur with integer and word oriented
- encodings, like UTF-16 and UTF-32, where the sequence of bytes varies depending
- on the hardware on which the string was encoded.</p></li>
- </ol>
- </section>
- <section id="references">
- <h3>References<a class="headerlink" href="#references" title="Permalink to this headline">¶</a></h3>
- <p>The <a class="reference external" href="https://www.unicode.org">Unicode Consortium site</a> has character charts, a
- glossary, and PDF versions of the Unicode specification. Be prepared for some
- difficult reading. <a class="reference external" href="https://www.unicode.org/history/">A chronology</a> of the
- origin and development of Unicode is also available on the site.</p>
- <p>On the Computerphile YouTube channel, Tom Scott briefly
- <a class="reference external" href="https://www.youtube.com/watch?v=MijmeoH9LT4">discusses the history of Unicode and UTF-8</a>
- (9 minutes 36 seconds).</p>
- <p>To help understand the standard, Jukka Korpela has written <a class="reference external" href="https://jkorpela.fi/unicode/guide.html">an introductory
- guide</a> to reading the
- Unicode character tables.</p>
- <p>Another <a class="reference external" href="https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/">good introductory article</a>
- was written by Joel Spolsky.
- If this introduction didn’t make things clear to you, you should try
- reading this alternate article before continuing.</p>
- <p>Wikipedia entries are often helpful; see the entries for “<a class="reference external" href="https://en.wikipedia.org/wiki/Character_encoding">character encoding</a>” and <a class="reference external" href="https://en.wikipedia.org/wiki/UTF-8">UTF-8</a>, for example.</p>
- </section>
- </section>
- <section id="python-s-unicode-support">
- <h2>Python’s Unicode Support<a class="headerlink" href="#python-s-unicode-support" title="Permalink to this headline">¶</a></h2>
- <p>Now that you’ve learned the rudiments of Unicode, we can look at Python’s
- Unicode features.</p>
- <section id="the-string-type">
- <h3>The String Type<a class="headerlink" href="#the-string-type" title="Permalink to this headline">¶</a></h3>
- <p>Since Python 3.0, the language’s <a class="reference internal" href="../library/stdtypes.html#str" title="str"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a> type contains Unicode
- characters, meaning any string created using <code class="docutils literal notranslate"><span class="pre">"unicode</span> <span class="pre">rocks!"</span></code>, <code class="docutils literal notranslate"><span class="pre">'unicode</span>
- <span class="pre">rocks!'</span></code>, or the triple-quoted string syntax is stored as Unicode.</p>
- <p>The default encoding for Python source code is UTF-8, so you can simply
- include a Unicode character in a string literal:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="k">try</span><span class="p">:</span>
- <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">'/tmp/input.txt'</span><span class="p">,</span> <span class="s1">'r'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="o">...</span>
- <span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
- <span class="c1"># 'File not found' error message.</span>
- <span class="nb">print</span><span class="p">(</span><span class="s2">"Fichier non trouvé"</span><span class="p">)</span>
- </pre></div>
- </div>
- <p>Side note: Python 3 also supports using Unicode characters in identifiers:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="n">répertoire</span> <span class="o">=</span> <span class="s2">"/tmp/records.log"</span>
- <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">répertoire</span><span class="p">,</span> <span class="s2">"w"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s2">"test</span><span class="se">\n</span><span class="s2">"</span><span class="p">)</span>
- </pre></div>
- </div>
- <p>If you can’t enter a particular character in your editor or want to
- keep the source code ASCII-only for some reason, you can also use
- escape sequences in string literals. (Depending on your system,
- you may see the actual capital-delta glyph instead of a <code class="docutils literal notranslate"><span class="pre">\u</span></code> escape.)</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="s2">"</span><span class="se">\N{GREEK CAPITAL LETTER DELTA}</span><span class="s2">"</span> <span class="c1"># Using the character name</span>
- <span class="go">'\u0394'</span>
- <span class="gp">>>> </span><span class="s2">"</span><span class="se">\u0394</span><span class="s2">"</span> <span class="c1"># Using a 16-bit hex value</span>
- <span class="go">'\u0394'</span>
- <span class="gp">>>> </span><span class="s2">"</span><span class="se">\U00000394</span><span class="s2">"</span> <span class="c1"># Using a 32-bit hex value</span>
- <span class="go">'\u0394'</span>
- </pre></div>
- </div>
- <p>In addition, one can create a string using the <a class="reference internal" href="../library/stdtypes.html#bytes.decode" title="bytes.decode"><code class="xref py py-func docutils literal notranslate"><span class="pre">decode()</span></code></a> method of
- <a class="reference internal" href="../library/stdtypes.html#bytes" title="bytes"><code class="xref py py-class docutils literal notranslate"><span class="pre">bytes</span></code></a>. This method takes an <em>encoding</em> argument, such as <code class="docutils literal notranslate"><span class="pre">UTF-8</span></code>,
- and optionally an <em>errors</em> argument.</p>
- <p>The <em>errors</em> argument specifies the response when the input string can’t be
- converted according to the encoding’s rules. Legal values for this argument are
- <code class="docutils literal notranslate"><span class="pre">'strict'</span></code> (raise a <a class="reference internal" href="../library/exceptions.html#UnicodeDecodeError" title="UnicodeDecodeError"><code class="xref py py-exc docutils literal notranslate"><span class="pre">UnicodeDecodeError</span></code></a> exception), <code class="docutils literal notranslate"><span class="pre">'replace'</span></code> (use
- <code class="docutils literal notranslate"><span class="pre">U+FFFD</span></code>, <code class="docutils literal notranslate"><span class="pre">REPLACEMENT</span> <span class="pre">CHARACTER</span></code>), <code class="docutils literal notranslate"><span class="pre">'ignore'</span></code> (just leave the
- character out of the Unicode result), or <code class="docutils literal notranslate"><span class="pre">'backslashreplace'</span></code> (inserts a
- <code class="docutils literal notranslate"><span class="pre">\xNN</span></code> escape sequence).
- The following examples show the differences:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="sa">b</span><span class="s1">'</span><span class="se">\x80</span><span class="s1">abc'</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">,</span> <span class="s2">"strict"</span><span class="p">)</span>
- <span class="gt">Traceback (most recent call last):</span>
- <span class="w"> </span><span class="o">...</span>
- <span class="gr">UnicodeDecodeError</span>: <span class="n">'utf-8' codec can't decode byte 0x80 in position 0:</span>
- <span class="x"> invalid start byte</span>
- <span class="gp">>>> </span><span class="sa">b</span><span class="s1">'</span><span class="se">\x80</span><span class="s1">abc'</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">,</span> <span class="s2">"replace"</span><span class="p">)</span>
- <span class="go">'\ufffdabc'</span>
- <span class="gp">>>> </span><span class="sa">b</span><span class="s1">'</span><span class="se">\x80</span><span class="s1">abc'</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">,</span> <span class="s2">"backslashreplace"</span><span class="p">)</span>
- <span class="go">'\\x80abc'</span>
- <span class="gp">>>> </span><span class="sa">b</span><span class="s1">'</span><span class="se">\x80</span><span class="s1">abc'</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">,</span> <span class="s2">"ignore"</span><span class="p">)</span>
- <span class="go">'abc'</span>
- </pre></div>
- </div>
- <p>Encodings are specified as strings containing the encoding’s name. Python
- comes with roughly 100 different encodings; see the Python Library Reference at
- <a class="reference internal" href="../library/codecs.html#standard-encodings"><span class="std std-ref">Standard Encodings</span></a> for a list. Some encodings have multiple names; for
- example, <code class="docutils literal notranslate"><span class="pre">'latin-1'</span></code>, <code class="docutils literal notranslate"><span class="pre">'iso_8859_1'</span></code> and <code class="docutils literal notranslate"><span class="pre">'8859'</span></code> are all synonyms for
- the same encoding.</p>
- <p>One-character Unicode strings can also be created with the <a class="reference internal" href="../library/functions.html#chr" title="chr"><code class="xref py py-func docutils literal notranslate"><span class="pre">chr()</span></code></a>
- built-in function, which takes integers and returns a Unicode string of length 1
- that contains the corresponding code point. The reverse operation is the
- built-in <a class="reference internal" href="../library/functions.html#ord" title="ord"><code class="xref py py-func docutils literal notranslate"><span class="pre">ord()</span></code></a> function that takes a one-character Unicode string and
- returns the code point value:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="nb">chr</span><span class="p">(</span><span class="mi">57344</span><span class="p">)</span>
- <span class="go">'\ue000'</span>
- <span class="gp">>>> </span><span class="nb">ord</span><span class="p">(</span><span class="s1">'</span><span class="se">\ue000</span><span class="s1">'</span><span class="p">)</span>
- <span class="go">57344</span>
- </pre></div>
- </div>
- </section>
- <section id="converting-to-bytes">
- <h3>Converting to Bytes<a class="headerlink" href="#converting-to-bytes" title="Permalink to this headline">¶</a></h3>
- <p>The opposite method of <a class="reference internal" href="../library/stdtypes.html#bytes.decode" title="bytes.decode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">bytes.decode()</span></code></a> is <a class="reference internal" href="../library/stdtypes.html#str.encode" title="str.encode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">str.encode()</span></code></a>,
- which returns a <a class="reference internal" href="../library/stdtypes.html#bytes" title="bytes"><code class="xref py py-class docutils literal notranslate"><span class="pre">bytes</span></code></a> representation of the Unicode string, encoded in the
- requested <em>encoding</em>.</p>
- <p>The <em>errors</em> parameter is the same as the parameter of the
- <a class="reference internal" href="../library/stdtypes.html#bytes.decode" title="bytes.decode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">decode()</span></code></a> method but supports a few more possible handlers. As well as
- <code class="docutils literal notranslate"><span class="pre">'strict'</span></code>, <code class="docutils literal notranslate"><span class="pre">'ignore'</span></code>, and <code class="docutils literal notranslate"><span class="pre">'replace'</span></code> (which in this case
- inserts a question mark instead of the unencodable character), there is
- also <code class="docutils literal notranslate"><span class="pre">'xmlcharrefreplace'</span></code> (inserts an XML character reference),
- <code class="docutils literal notranslate"><span class="pre">'backslashreplace'</span></code> (inserts a <code class="docutils literal notranslate"><span class="pre">\uNNNN</span></code> escape sequence) and
- <code class="docutils literal notranslate"><span class="pre">'namereplace'</span></code> (inserts a <code class="docutils literal notranslate"><span class="pre">\N{...}</span></code> escape sequence).</p>
- <p>The following example shows the different results:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">u</span> <span class="o">=</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">40960</span><span class="p">)</span> <span class="o">+</span> <span class="s1">'abcd'</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">1972</span><span class="p">)</span>
- <span class="gp">>>> </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'utf-8'</span><span class="p">)</span>
- <span class="go">b'\xea\x80\x80abcd\xde\xb4'</span>
- <span class="gp">>>> </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'ascii'</span><span class="p">)</span>
- <span class="gt">Traceback (most recent call last):</span>
- <span class="w"> </span><span class="o">...</span>
- <span class="gr">UnicodeEncodeError</span>: <span class="n">'ascii' codec can't encode character '\ua000' in</span>
- <span class="x"> position 0: ordinal not in range(128)</span>
- <span class="gp">>>> </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'ascii'</span><span class="p">,</span> <span class="s1">'ignore'</span><span class="p">)</span>
- <span class="go">b'abcd'</span>
- <span class="gp">>>> </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'ascii'</span><span class="p">,</span> <span class="s1">'replace'</span><span class="p">)</span>
- <span class="go">b'?abcd?'</span>
- <span class="gp">>>> </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'ascii'</span><span class="p">,</span> <span class="s1">'xmlcharrefreplace'</span><span class="p">)</span>
- <span class="go">b'&amp;#40960;abcd&amp;#1972;'</span>
- <span class="gp">>>> </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'ascii'</span><span class="p">,</span> <span class="s1">'backslashreplace'</span><span class="p">)</span>
- <span class="go">b'\\ua000abcd\\u07b4'</span>
- <span class="gp">>>> </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">'ascii'</span><span class="p">,</span> <span class="s1">'namereplace'</span><span class="p">)</span>
- <span class="go">b'\\N{YI SYLLABLE IT}abcd\\u07b4'</span>
- </pre></div>
- </div>
- <p>The low-level routines for registering and accessing the available
- encodings are found in the <a class="reference internal" href="../library/codecs.html#module-codecs" title="codecs: Encode and decode data and streams."><code class="xref py py-mod docutils literal notranslate"><span class="pre">codecs</span></code></a> module. Implementing new
- encodings also requires understanding the <a class="reference internal" href="../library/codecs.html#module-codecs" title="codecs: Encode and decode data and streams."><code class="xref py py-mod docutils literal notranslate"><span class="pre">codecs</span></code></a> module.
- However, the encoding and decoding functions returned by this module
- are usually more low-level than is comfortable, and writing new encodings
- is a specialized task, so the module won’t be covered in this HOWTO.</p>
- </section>
- <section id="unicode-literals-in-python-source-code">
- <h3>Unicode Literals in Python Source Code<a class="headerlink" href="#unicode-literals-in-python-source-code" title="Permalink to this headline">¶</a></h3>
- <p>In Python source code, specific Unicode code points can be written using the
- <code class="docutils literal notranslate"><span class="pre">\u</span></code> escape sequence, which is followed by four hex digits giving the code
- point. The <code class="docutils literal notranslate"><span class="pre">\U</span></code> escape sequence is similar, but expects eight hex digits,
- not four:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">s</span> <span class="o">=</span> <span class="s2">"a</span><span class="se">\xac\u1234\u20ac\U00008000</span><span class="s2">"</span>
- <span class="gp">... </span><span class="c1">#     ^^^^ two-digit hex escape</span>
- <span class="gp">... </span><span class="c1">#         ^^^^^^ four-digit Unicode escape</span>
- <span class="gp">... </span><span class="c1">#                     ^^^^^^^^^^ eight-digit Unicode escape</span>
- <span class="gp">>>> </span><span class="p">[</span><span class="nb">ord</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">s</span><span class="p">]</span>
- <span class="go">[97, 172, 4660, 8364, 32768]</span>
- </pre></div>
- </div>
- <p>Using escape sequences for code points greater than 127 is fine in small doses,
- but becomes an annoyance if you’re using many accented characters, as you would
- in a program with messages in French or some other accent-using language. You
- can also assemble strings using the <a class="reference internal" href="../library/functions.html#chr" title="chr"><code class="xref py py-func docutils literal notranslate"><span class="pre">chr()</span></code></a> built-in function, but this is
- even more tedious.</p>
- <p>Ideally, you’d want to be able to write literals in your language’s natural
- encoding. You could then edit Python source code with your favorite editor
- which would display the accented characters naturally, and have the right
- characters used at runtime.</p>
- <p>Python supports writing source code in UTF-8 by default, but you can use almost
- any encoding if you declare the encoding being used. This is done by including
- a special comment as either the first or second line of the source file:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="ch">#!/usr/bin/env python</span>
- <span class="c1"># -*- coding: latin-1 -*-</span>
- <span class="n">u</span> <span class="o">=</span> <span class="s1">'abcdé'</span>
- <span class="nb">print</span><span class="p">(</span><span class="nb">ord</span><span class="p">(</span><span class="n">u</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]))</span>
- </pre></div>
- </div>
- <p>The syntax is inspired by Emacs’s notation for specifying variables local to a
- file. Emacs supports many different variables, but Python only supports
- ‘coding’. The <code class="docutils literal notranslate"><span class="pre">-*-</span></code> symbols indicate to Emacs that the comment is special;
- they have no significance to Python but are a convention. Python looks for
- <code class="docutils literal notranslate"><span class="pre">coding:</span> <span class="pre">name</span></code> or <code class="docutils literal notranslate"><span class="pre">coding=name</span></code> in the comment.</p>
- <p>If you don’t include such a comment, the default encoding used will be UTF-8 as
- already mentioned. See also <span class="target" id="index-0"></span><a class="pep reference external" href="https://peps.python.org/pep-0263/"><strong>PEP 263</strong></a> for more information.</p>
- </section>
- <section id="unicode-properties">
- <h3>Unicode Properties<a class="headerlink" href="#unicode-properties" title="Permalink to this headline">¶</a></h3>
- <p>The Unicode specification includes a database of information about
- code points. For each defined code point, the information includes
- the character’s name, its category, the numeric value if applicable
- (for characters representing numeric concepts such as the Roman
- numerals, fractions such as one-third and four-fifths, etc.). There
- are also display-related properties, such as how to use the code point
- in bidirectional text.</p>
- <p>The following program displays some information about several characters, and
- prints the numeric value of one particular character:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">unicodedata</span>
- <span class="n">u</span> <span class="o">=</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">233</span><span class="p">)</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mh">0x0bf2</span><span class="p">)</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">3972</span><span class="p">)</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">6000</span><span class="p">)</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">13231</span><span class="p">)</span>
- <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">u</span><span class="p">):</span>
-     <span class="nb">print</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="s1">'</span><span class="si">%04x</span><span class="s1">'</span> <span class="o">%</span> <span class="nb">ord</span><span class="p">(</span><span class="n">c</span><span class="p">),</span> <span class="n">unicodedata</span><span class="o">.</span><span class="n">category</span><span class="p">(</span><span class="n">c</span><span class="p">),</span> <span class="n">end</span><span class="o">=</span><span class="s2">" "</span><span class="p">)</span>
-     <span class="nb">print</span><span class="p">(</span><span class="n">unicodedata</span><span class="o">.</span><span class="n">name</span><span class="p">(</span><span class="n">c</span><span class="p">))</span>
- <span class="c1"># Get numeric value of second character</span>
- <span class="nb">print</span><span class="p">(</span><span class="n">unicodedata</span><span class="o">.</span><span class="n">numeric</span><span class="p">(</span><span class="n">u</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span>
- </pre></div>
- </div>
- <p>When run, this prints:</p>
- <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>0 00e9 Ll LATIN SMALL LETTER E WITH ACUTE
- 1 0bf2 No TAMIL NUMBER ONE THOUSAND
- 2 0f84 Mn TIBETAN MARK HALANTA
- 3 1770 Lo TAGBANWA LETTER SA
- 4 33af So SQUARE RAD OVER S SQUARED
- 1000.0
- </pre></div>
- </div>
- <p>The category codes are abbreviations describing the nature of the character.
- These are grouped into categories such as “Letter”, “Number”, “Punctuation”, or
- “Symbol”, which in turn are broken up into subcategories. To take the codes
- from the above output, <code class="docutils literal notranslate"><span class="pre">'Ll'</span></code> means “Letter, lowercase”, <code class="docutils literal notranslate"><span class="pre">'No'</span></code> means
- “Number, other”, <code class="docutils literal notranslate"><span class="pre">'Mn'</span></code> is “Mark, nonspacing”, and <code class="docutils literal notranslate"><span class="pre">'So'</span></code> is “Symbol,
- other”. See
- <a class="reference external" href="https://www.unicode.org/reports/tr44/#General_Category_Values">the General Category Values section of the Unicode Character Database documentation</a> for a
- list of category codes.</p>
- </section>
- <section id="comparing-strings">
- <h3>Comparing Strings<a class="headerlink" href="#comparing-strings" title="Permalink to this headline">¶</a></h3>
- <p>Unicode adds some complication to comparing strings, because the same
- set of characters can be represented by different sequences of code
- points. For example, a letter like ‘ê’ can be represented as a single
- code point U+00EA, or as U+0065 U+0302, which is the code point for
- ‘e’ followed by a code point for ‘COMBINING CIRCUMFLEX ACCENT’. These
- will produce the same output when printed, but one is a string of
- length 1 and the other is of length 2.</p>
- <p>One tool for a case-insensitive comparison is the
- <a class="reference internal" href="../library/stdtypes.html#str.casefold" title="str.casefold"><code class="xref py py-meth docutils literal notranslate"><span class="pre">casefold()</span></code></a> string method that converts a string to a
- case-insensitive form following an algorithm described by the Unicode
- Standard. This algorithm has special handling for characters such as
- the German letter ‘ß’ (code point U+00DF), which becomes the pair of
- lowercase letters ‘ss’.</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">>>> </span><span class="n">street</span> <span class="o">=</span> <span class="s1">'Gürzenichstraße'</span>
- <span class="gp">>>> </span><span class="n">street</span><span class="o">.</span><span class="n">casefold</span><span class="p">()</span>
- <span class="go">'gürzenichstrasse'</span>
- </pre></div>
- </div>
- <p>A second tool is the <a class="reference internal" href="../library/unicodedata.html#module-unicodedata" title="unicodedata: Access the Unicode Database."><code class="xref py py-mod docutils literal notranslate"><span class="pre">unicodedata</span></code></a> module’s
- <a class="reference internal" href="../library/unicodedata.html#unicodedata.normalize" title="unicodedata.normalize"><code class="xref py py-func docutils literal notranslate"><span class="pre">normalize()</span></code></a> function that converts strings to one
- of several normal forms, in which a given character is always represented the
- same way: either as a single precomposed code point or as a base letter followed
- by combining characters. <a class="reference internal" href="../library/unicodedata.html#unicodedata.normalize" title="unicodedata.normalize"><code class="xref py py-func docutils literal notranslate"><span class="pre">normalize()</span></code></a> can
- be used to perform string comparisons that won’t falsely report
- inequality if two strings use combining characters differently:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">unicodedata</span>
- <span class="k">def</span> <span class="nf">compare_strs</span><span class="p">(</span><span class="n">s1</span><span class="p">,</span> <span class="n">s2</span><span class="p">):</span>
-     <span class="k">def</span> <span class="nf">NFD</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
-         <span class="k">return</span> <span class="n">unicodedata</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="s1">'NFD'</span><span class="p">,</span> <span class="n">s</span><span class="p">)</span>
-     <span class="k">return</span> <span class="n">NFD</span><span class="p">(</span><span class="n">s1</span><span class="p">)</span> <span class="o">==</span> <span class="n">NFD</span><span class="p">(</span><span class="n">s2</span><span class="p">)</span>
- <span class="n">single_char</span> <span class="o">=</span> <span class="s1">'ê'</span>
- <span class="n">multiple_chars</span> <span class="o">=</span> <span class="s1">'</span><span class="se">\N{LATIN SMALL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}</span><span class="s1">'</span>
- <span class="nb">print</span><span class="p">(</span><span class="s1">'length of first string='</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">single_char</span><span class="p">))</span>
- <span class="nb">print</span><span class="p">(</span><span class="s1">'length of second string='</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">multiple_chars</span><span class="p">))</span>
- <span class="nb">print</span><span class="p">(</span><span class="n">compare_strs</span><span class="p">(</span><span class="n">single_char</span><span class="p">,</span> <span class="n">multiple_chars</span><span class="p">))</span>
- </pre></div>
- </div>
- <p>When run, this outputs:</p>
- <div class="highlight-shell-session notranslate"><div class="highlight"><pre><span></span><span class="gp">$ </span>python<span class="w"> </span>compare-strs.py
- <span class="go">length of first string= 1</span>
- <span class="go">length of second string= 2</span>
- <span class="go">True</span>
- </pre></div>
- </div>
- <p>The first argument to the <a class="reference internal" href="../library/unicodedata.html#unicodedata.normalize" title="unicodedata.normalize"><code class="xref py py-func docutils literal notranslate"><span class="pre">normalize()</span></code></a> function is a
- string giving the desired normalization form, which can be one of
- ‘NFC’, ‘NFKC’, ‘NFD’, and ‘NFKD’.</p>
- <p>The Unicode Standard also specifies how to do caseless comparisons:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">unicodedata</span>
- <span class="k">def</span> <span class="nf">compare_caseless</span><span class="p">(</span><span class="n">s1</span><span class="p">,</span> <span class="n">s2</span><span class="p">):</span>
-     <span class="k">def</span> <span class="nf">NFD</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
-         <span class="k">return</span> <span class="n">unicodedata</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="s1">'NFD'</span><span class="p">,</span> <span class="n">s</span><span class="p">)</span>
-     <span class="k">return</span> <span class="n">NFD</span><span class="p">(</span><span class="n">NFD</span><span class="p">(</span><span class="n">s1</span><span class="p">)</span><span class="o">.</span><span class="n">casefold</span><span class="p">())</span> <span class="o">==</span> <span class="n">NFD</span><span class="p">(</span><span class="n">NFD</span><span class="p">(</span><span class="n">s2</span><span class="p">)</span><span class="o">.</span><span class="n">casefold</span><span class="p">())</span>
- <span class="c1"># Example usage</span>
- <span class="n">single_char</span> <span class="o">=</span> <span class="s1">'ê'</span>
- <span class="n">multiple_chars</span> <span class="o">=</span> <span class="s1">'</span><span class="se">\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}</span><span class="s1">'</span>
- <span class="nb">print</span><span class="p">(</span><span class="n">compare_caseless</span><span class="p">(</span><span class="n">single_char</span><span class="p">,</span> <span class="n">multiple_chars</span><span class="p">))</span>
- </pre></div>
- </div>
- <p>This will print <code class="docutils literal notranslate"><span class="pre">True</span></code>. (Why is <code class="xref py py-func docutils literal notranslate"><span class="pre">NFD()</span></code> invoked twice? Because
- there are a few characters that make <a class="reference internal" href="../library/stdtypes.html#str.casefold" title="str.casefold"><code class="xref py py-meth docutils literal notranslate"><span class="pre">casefold()</span></code></a> return a
- non-normalized string, so the result needs to be normalized again. See
- section 3.13 of the Unicode Standard for a discussion and an example.)</p>
- </section>
- <section id="unicode-regular-expressions">
- <h3>Unicode Regular Expressions<a class="headerlink" href="#unicode-regular-expressions" title="Permalink to this headline">¶</a></h3>
- <p>The regular expressions supported by the <a class="reference internal" href="../library/re.html#module-re" title="re: Regular expression operations."><code class="xref py py-mod docutils literal notranslate"><span class="pre">re</span></code></a> module can be provided
- either as bytes or strings. Some of the special character sequences such as
- <code class="docutils literal notranslate"><span class="pre">\d</span></code> and <code class="docutils literal notranslate"><span class="pre">\w</span></code> have different meanings depending on whether
- the pattern is supplied as bytes or a string. For example,
- <code class="docutils literal notranslate"><span class="pre">\d</span></code> will match the characters <code class="docutils literal notranslate"><span class="pre">[0-9]</span></code> in bytes but
- in strings will match any character that’s in the <code class="docutils literal notranslate"><span class="pre">'Nd'</span></code> category.</p>
- <p>The string in this example has the number 57 written in both Thai and
- Arabic numerals:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">re</span>
- <span class="n">p</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s1">'\d+'</span><span class="p">)</span>
- <span class="n">s</span> <span class="o">=</span> <span class="s2">"Over </span><span class="se">\u0e55\u0e57</span><span class="s2"> 57 flavours"</span>
- <span class="n">m</span> <span class="o">=</span> <span class="n">p</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">s</span><span class="p">)</span>
- <span class="nb">print</span><span class="p">(</span><span class="nb">repr</span><span class="p">(</span><span class="n">m</span><span class="o">.</span><span class="n">group</span><span class="p">()))</span>
- </pre></div>
- </div>
- <p>When executed, <code class="docutils literal notranslate"><span class="pre">\d+</span></code> will match the Thai numerals and print them
- out. If you supply the <a class="reference internal" href="../library/re.html#re.ASCII" title="re.ASCII"><code class="xref py py-const docutils literal notranslate"><span class="pre">re.ASCII</span></code></a> flag to
- <a class="reference internal" href="../library/re.html#re.compile" title="re.compile"><code class="xref py py-func docutils literal notranslate"><span class="pre">compile()</span></code></a>, <code class="docutils literal notranslate"><span class="pre">\d+</span></code> will match the substring “57” instead.</p>
- <p>Similarly, <code class="docutils literal notranslate"><span class="pre">\w</span></code> matches a wide variety of Unicode characters but
- only <code class="docutils literal notranslate"><span class="pre">[a-zA-Z0-9_]</span></code> in bytes or if <a class="reference internal" href="../library/re.html#re.ASCII" title="re.ASCII"><code class="xref py py-const docutils literal notranslate"><span class="pre">re.ASCII</span></code></a> is supplied,
- and <code class="docutils literal notranslate"><span class="pre">\s</span></code> will match either Unicode whitespace characters or
- <code class="docutils literal notranslate"><span class="pre">[</span> <span class="pre">\t\n\r\f\v]</span></code>.</p>
- </section>
- <section id="id2">
- <h3>References<a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h3>
- <p>Some good alternative discussions of Python’s Unicode support are:</p>
- <ul class="simple">
- <li><p><a class="reference external" href="https://python-notes.curiousefficiency.org/en/latest/python3/text_file_processing.html">Processing Text Files in Python 3</a>, by Nick Coghlan.</p></li>
- <li><p><a class="reference external" href="https://nedbatchelder.com/text/unipain.html">Pragmatic Unicode</a>, a PyCon 2012 presentation by Ned Batchelder.</p></li>
- </ul>
- <p>The <a class="reference internal" href="../library/stdtypes.html#str" title="str"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a> type is described in the Python library reference at
- <a class="reference internal" href="../library/stdtypes.html#textseq"><span class="std std-ref">Text Sequence Type — str</span></a>.</p>
- <p>The documentation for the <a class="reference internal" href="../library/unicodedata.html#module-unicodedata" title="unicodedata: Access the Unicode Database."><code class="xref py py-mod docutils literal notranslate"><span class="pre">unicodedata</span></code></a> module.</p>
- <p>The documentation for the <a class="reference internal" href="../library/codecs.html#module-codecs" title="codecs: Encode and decode data and streams."><code class="xref py py-mod docutils literal notranslate"><span class="pre">codecs</span></code></a> module.</p>
- <p>Marc-André Lemburg gave <a class="reference external" href="https://downloads.egenix.com/python/Unicode-EPC2002-Talk.pdf">a presentation titled “Python and Unicode” (PDF slides)</a> at
- EuroPython 2002. The slides are an excellent overview of the design of Python
- 2’s Unicode features (where the Unicode string type is called <code class="docutils literal notranslate"><span class="pre">unicode</span></code> and
- literals start with <code class="docutils literal notranslate"><span class="pre">u</span></code>).</p>
- </section>
- </section>
- <section id="reading-and-writing-unicode-data">
- <h2>Reading and Writing Unicode Data<a class="headerlink" href="#reading-and-writing-unicode-data" title="Permalink to this headline">¶</a></h2>
- <p>Once you’ve written some code that works with Unicode data, the next problem is
- input/output. How do you get Unicode strings into your program, and how do you
- convert Unicode into a form suitable for storage or transmission?</p>
- <p>It’s possible that you may not need to do anything depending on your input
- sources and output destinations; you should check whether the libraries used in
- your application support Unicode natively. XML parsers often return Unicode
- data, for example. Many relational databases also support Unicode-valued
- columns and can return Unicode values from an SQL query.</p>
- <p>Unicode data is usually converted to a particular encoding before it gets
- written to disk or sent over a socket. It’s possible to do all the work
- yourself: open a file, read an 8-bit bytes object from it, and convert the bytes
- with <code class="docutils literal notranslate"><span class="pre">bytes.decode(encoding)</span></code>. However, the manual approach is not recommended.</p>
- <p>One problem is the multi-byte nature of encodings; one Unicode character can be
- represented by several bytes. If you want to read the file in arbitrary-sized
- chunks (say, 1024 or 4096 bytes), you need to write error-handling code to catch the case
- where only some of the bytes encoding a single Unicode character are read at the
- end of a chunk. One solution would be to read the entire file into memory and
- then perform the decoding, but that prevents you from working with files that
- are extremely large; if you need to read a 2 GiB file, you need 2 GiB of RAM.
- (More, really, since for at least a moment you’d need to have both the encoded
- string and its Unicode version in memory.)</p>
- <p>The solution would be to use the low-level decoding interface to catch the case
- of partial coding sequences. The work of implementing this has already been
- done for you: the built-in <a class="reference internal" href="../library/functions.html#open" title="open"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a> function can return a file-like object
- that assumes the file’s contents are in a specified encoding and accepts Unicode
- parameters for methods such as <a class="reference internal" href="../library/io.html#io.TextIOBase.read" title="io.TextIOBase.read"><code class="xref py py-meth docutils literal notranslate"><span class="pre">read()</span></code></a> and
- <a class="reference internal" href="../library/io.html#io.TextIOBase.write" title="io.TextIOBase.write"><code class="xref py py-meth docutils literal notranslate"><span class="pre">write()</span></code></a>. This works through <a class="reference internal" href="../library/functions.html#open" title="open"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a>'s <em>encoding</em> and
- <em>errors</em> parameters which are interpreted just like those in <a class="reference internal" href="../library/stdtypes.html#str.encode" title="str.encode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">str.encode()</span></code></a>
- and <a class="reference internal" href="../library/stdtypes.html#bytes.decode" title="bytes.decode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">bytes.decode()</span></code></a>.</p>
- <p>Reading Unicode from a file is therefore simple:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">'unicode.txt'</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s1">'utf-8'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
-     <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">f</span><span class="p">:</span>
-         <span class="nb">print</span><span class="p">(</span><span class="nb">repr</span><span class="p">(</span><span class="n">line</span><span class="p">))</span>
- </pre></div>
- </div>
- <p>It’s also possible to open files in update mode, allowing both reading and
- writing:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">'test'</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s1">'utf-8'</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">'w+'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">'</span><span class="se">\u4500</span><span class="s1"> blah blah blah</span><span class="se">\n</span><span class="s1">'</span><span class="p">)</span>
- <span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
- <span class="nb">print</span><span class="p">(</span><span class="nb">repr</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">readline</span><span class="p">()[:</span><span class="mi">1</span><span class="p">]))</span>
- </pre></div>
- </div>
- <p>The Unicode character <code class="docutils literal notranslate"><span class="pre">U+FEFF</span></code> is used as a byte-order mark (BOM), and is often
- written as the first character of a file in order to assist with autodetection
- of the file’s byte ordering. Some encodings, such as UTF-16, expect a BOM to be
- present at the start of a file; when such an encoding is used, the BOM will be
- automatically written as the first character and will be silently dropped when
- the file is read. There are variants of these encodings, such as ‘utf-16-le’
- and ‘utf-16-be’ for little-endian and big-endian encodings, that specify one
- particular byte ordering and don’t skip the BOM.</p>
- <p>In some areas, it is also convention to use a “BOM” at the start of UTF-8
- encoded files; the name is misleading since UTF-8 is not byte-order dependent.
- The mark simply announces that the file is encoded in UTF-8. For reading such
- files, use the ‘utf-8-sig’ codec to automatically skip the mark if present.</p>
- <section id="unicode-filenames">
- <h3>Unicode filenames<a class="headerlink" href="#unicode-filenames" title="Permalink to this headline">¶</a></h3>
- <p>Most of the operating systems in common use today support filenames
- that contain arbitrary Unicode characters. Usually this is
- implemented by converting the Unicode string into some encoding that
- varies depending on the system. Today Python is converging on using
- UTF-8: Python on macOS has used UTF-8 for several versions, and Python
- 3.6 switched to using UTF-8 on Windows as well. On Unix systems,
- there will only be a <a class="reference internal" href="../glossary.html#term-filesystem-encoding-and-error-handler"><span class="xref std std-term">filesystem encoding</span></a> if you’ve set the <code class="docutils literal notranslate"><span class="pre">LANG</span></code> or <code class="docutils literal notranslate"><span class="pre">LC_CTYPE</span></code> environment variables; if
- you haven’t, the default encoding is again UTF-8.</p>
- <p>The <a class="reference internal" href="../library/sys.html#sys.getfilesystemencoding" title="sys.getfilesystemencoding"><code class="xref py py-func docutils literal notranslate"><span class="pre">sys.getfilesystemencoding()</span></code></a> function returns the encoding to use on
- your current system, in case you want to do the encoding manually, but there’s
- not much reason to bother. When opening a file for reading or writing, you can
- usually just provide the Unicode string as the filename, and it will be
- automatically converted to the right encoding for you:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="n">filename</span> <span class="o">=</span> <span class="s1">'filename</span><span class="se">\u4500</span><span class="s1">abc'</span>
- <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="s1">'w'</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">'blah</span><span class="se">\n</span><span class="s1">'</span><span class="p">)</span>
- </pre></div>
- </div>
- <p>Functions in the <a class="reference internal" href="../library/os.html#module-os" title="os: Miscellaneous operating system interfaces."><code class="xref py py-mod docutils literal notranslate"><span class="pre">os</span></code></a> module such as <a class="reference internal" href="../library/os.html#os.stat" title="os.stat"><code class="xref py py-func docutils literal notranslate"><span class="pre">os.stat()</span></code></a> will also accept Unicode
- filenames.</p>
- <p>The <a class="reference internal" href="../library/os.html#os.listdir" title="os.listdir"><code class="xref py py-func docutils literal notranslate"><span class="pre">os.listdir()</span></code></a> function returns filenames, which raises an issue: should it return
- the Unicode version of filenames, or should it return bytes containing
- the encoded versions? <a class="reference internal" href="../library/os.html#os.listdir" title="os.listdir"><code class="xref py py-func docutils literal notranslate"><span class="pre">os.listdir()</span></code></a> can do both, depending on whether you
- provided the directory path as bytes or a Unicode string. If you pass a
- Unicode string as the path, filenames will be decoded using the filesystem’s
- encoding and a list of Unicode strings will be returned, while passing a byte
- path will return the filenames as bytes. For example,
- assuming the default <a class="reference internal" href="../glossary.html#term-filesystem-encoding-and-error-handler"><span class="xref std std-term">filesystem encoding</span></a> is UTF-8, running the following program:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="n">fn</span> <span class="o">=</span> <span class="s1">'filename</span><span class="se">\u4500</span><span class="s1">abc'</span>
- <span class="n">f</span> <span class="o">=</span> <span class="nb">open</span><span class="p">(</span><span class="n">fn</span><span class="p">,</span> <span class="s1">'w'</span><span class="p">)</span>
- <span class="n">f</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
- <span class="kn">import</span> <span class="nn">os</span>
- <span class="nb">print</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">listdir</span><span class="p">(</span><span class="sa">b</span><span class="s1">'.'</span><span class="p">))</span>
- <span class="nb">print</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">listdir</span><span class="p">(</span><span class="s1">'.'</span><span class="p">))</span>
- </pre></div>
- </div>
- <p>will produce the following output:</p>
- <div class="highlight-shell-session notranslate"><div class="highlight"><pre><span></span><span class="gp">$ </span>python<span class="w"> </span>listdir-test.py
- <span class="go">[b'filename\xe4\x94\x80abc', ...]</span>
- <span class="go">['filename\u4500abc', ...]</span>
- </pre></div>
- </div>
- <p>The first list contains UTF-8-encoded filenames, and the second list contains
- the Unicode versions.</p>
- <p>Note that on most occasions, you can just stick with using
- Unicode with these APIs. The bytes APIs should only be used on
- systems where undecodable file names can be present; that’s
- pretty much only Unix systems now.</p>
- </section>
- <section id="tips-for-writing-unicode-aware-programs">
- <h3>Tips for Writing Unicode-aware Programs<a class="headerlink" href="#tips-for-writing-unicode-aware-programs" title="Permalink to this headline">¶</a></h3>
- <p>This section provides some suggestions on writing software that deals with
- Unicode.</p>
- <p>The most important tip is:</p>
- <blockquote>
- <div><p>Software should only work with Unicode strings internally, decoding the input
- data as soon as possible and encoding the output only at the end.</p>
- </div></blockquote>
- <p>If you attempt to write processing functions that accept both Unicode and byte
- strings, you will find your program vulnerable to bugs wherever you combine the
- two different kinds of strings. There is no automatic encoding or decoding: if
- you do e.g. <code class="docutils literal notranslate"><span class="pre">str</span> <span class="pre">+</span> <span class="pre">bytes</span></code>, a <a class="reference internal" href="../library/exceptions.html#TypeError" title="TypeError"><code class="xref py py-exc docutils literal notranslate"><span class="pre">TypeError</span></code></a> will be raised.</p>
- <p>When using data coming from a web browser or some other untrusted source, a
- common technique is to check for illegal characters in a string before using the
- string in a generated command line or storing it in a database. If you’re doing
- this, be careful to check the decoded string, not the encoded bytes data;
- some encodings may have interesting properties, such as not being bijective
- or not being fully ASCII-compatible. This is especially true if the input
- data also specifies the encoding, since the attacker can then choose a
- clever way to hide malicious text in the encoded bytestream.</p>
- <section id="converting-between-file-encodings">
- <h4>Converting Between File Encodings<a class="headerlink" href="#converting-between-file-encodings" title="Permalink to this headline">¶</a></h4>
- <p>The <a class="reference internal" href="../library/codecs.html#codecs.StreamRecoder" title="codecs.StreamRecoder"><code class="xref py py-class docutils literal notranslate"><span class="pre">StreamRecoder</span></code></a> class can transparently convert between
- encodings, taking a stream that returns data in encoding #1
- and behaving like a stream returning data in encoding #2.</p>
- <p>For example, if you have an input file <em>f</em> that’s in Latin-1, you
- can wrap it with a <a class="reference internal" href="../library/codecs.html#codecs.StreamRecoder" title="codecs.StreamRecoder"><code class="xref py py-class docutils literal notranslate"><span class="pre">StreamRecoder</span></code></a> to return bytes encoded in
- UTF-8:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="n">new_f</span> <span class="o">=</span> <span class="n">codecs</span><span class="o">.</span><span class="n">StreamRecoder</span><span class="p">(</span><span class="n">f</span><span class="p">,</span>
- <span class="c1"># en/decoder: used by read() to encode its results and</span>
- <span class="c1"># by write() to decode its input.</span>
- <span class="n">codecs</span><span class="o">.</span><span class="n">getencoder</span><span class="p">(</span><span class="s1">'utf-8'</span><span class="p">),</span> <span class="n">codecs</span><span class="o">.</span><span class="n">getdecoder</span><span class="p">(</span><span class="s1">'utf-8'</span><span class="p">),</span>
- <span class="c1"># reader/writer: used to read and write to the stream.</span>
- <span class="n">codecs</span><span class="o">.</span><span class="n">getreader</span><span class="p">(</span><span class="s1">'latin-1'</span><span class="p">),</span> <span class="n">codecs</span><span class="o">.</span><span class="n">getwriter</span><span class="p">(</span><span class="s1">'latin-1'</span><span class="p">)</span> <span class="p">)</span>
- </pre></div>
- </div>
- </section>
- <section id="files-in-an-unknown-encoding">
- <h4>Files in an Unknown Encoding<a class="headerlink" href="#files-in-an-unknown-encoding" title="Permalink to this headline">¶</a></h4>
- <p>What can you do if you need to make a change to a file, but don’t know
- the file’s encoding? If you know the encoding is ASCII-compatible and
- only want to examine or modify the ASCII parts, you can open the file
- with the <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code> error handler:</p>
- <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">fname</span><span class="p">,</span> <span class="s1">'r'</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s2">"ascii"</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s2">"surrogateescape"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="n">data</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
- <span class="c1"># make changes to the string 'data'</span>
- <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">fname</span> <span class="o">+</span> <span class="s1">'.new'</span><span class="p">,</span> <span class="s1">'w'</span><span class="p">,</span>
- <span class="n">encoding</span><span class="o">=</span><span class="s2">"ascii"</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s2">"surrogateescape"</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
- <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
- </pre></div>
- </div>
- <p>The <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code> error handler will decode any non-ASCII bytes
- as code points in a special range running from U+DC80 to
- U+DCFF. These code points will then turn back into the
- same bytes when the <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code> error handler is used to
- encode the data and write it back out.</p>
- </section>
- </section>
- <section id="id3">
- <h3>References<a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h3>
- <p>One section of <a class="reference external" href="https://pyvideo.org/video/289/pycon-2010--mastering-python-3-i-o">Mastering Python 3 Input/Output</a>,
- a PyCon 2010 talk by David Beazley, discusses text processing and binary data handling.</p>
- <p>The <a class="reference external" href="https://downloads.egenix.com/python/LSM2005-Developing-Unicode-aware-applications-in-Python.pdf">PDF slides for Marc-André Lemburg’s presentation “Writing Unicode-aware
- Applications in Python”</a>
- discuss questions of character encodings as well as how to internationalize
- and localize an application. These slides cover Python 2.x only.</p>
- <p><a class="reference external" href="https://pyvideo.org/video/1768/the-guts-of-unicode-in-python">The Guts of Unicode in Python</a>
- is a PyCon 2013 talk by Benjamin Peterson that discusses the internal Unicode
- representation in Python 3.3.</p>
- </section>
- </section>
- <section id="acknowledgements">
- <h2>Acknowledgements<a class="headerlink" href="#acknowledgements" title="Permalink to this headline">¶</a></h2>
- <p>The initial draft of this document was written by Andrew Kuchling.
- It has since been revised further by Alexander Belopolsky, Georg Brandl,
- Andrew Kuchling, and Ezio Melotti.</p>
- <p>Thanks to the following people who have noted errors or offered
- suggestions on this article: Éric Araujo, Nicholas Bastin, Nick
- Coghlan, Marius Gedminas, Kent Johnson, Ken Krugler, Marc-André
- Lemburg, Martin von Löwis, Terry J. Reedy, Serhiy Storchaka,
- Eryk Sun, Chad Whitacre, Graham Wideman.</p>
- </section>
- </section>
- <div class="clearer"></div>
- </div>
- </div>
- </div>
- <div class="sphinxsidebar" role="navigation" aria-label="main navigation">
- <div class="sphinxsidebarwrapper">
- <div>
- <h3><a href="../contents.html">Table of Contents</a></h3>
- <ul>
- <li><a class="reference internal" href="#">Unicode HOWTO</a><ul>
- <li><a class="reference internal" href="#introduction-to-unicode">Introduction to Unicode</a><ul>
- <li><a class="reference internal" href="#definitions">Definitions</a></li>
- <li><a class="reference internal" href="#encodings">Encodings</a></li>
- <li><a class="reference internal" href="#references">References</a></li>
- </ul>
- </li>
- <li><a class="reference internal" href="#python-s-unicode-support">Python’s Unicode Support</a><ul>
- <li><a class="reference internal" href="#the-string-type">The String Type</a></li>
- <li><a class="reference internal" href="#converting-to-bytes">Converting to Bytes</a></li>
- <li><a class="reference internal" href="#unicode-literals-in-python-source-code">Unicode Literals in Python Source Code</a></li>
- <li><a class="reference internal" href="#unicode-properties">Unicode Properties</a></li>
- <li><a class="reference internal" href="#comparing-strings">Comparing Strings</a></li>
- <li><a class="reference internal" href="#unicode-regular-expressions">Unicode Regular Expressions</a></li>
- <li><a class="reference internal" href="#id2">References</a></li>
- </ul>
- </li>
- <li><a class="reference internal" href="#reading-and-writing-unicode-data">Reading and Writing Unicode Data</a><ul>
- <li><a class="reference internal" href="#unicode-filenames">Unicode filenames</a></li>
- <li><a class="reference internal" href="#tips-for-writing-unicode-aware-programs">Tips for Writing Unicode-aware Programs</a><ul>
- <li><a class="reference internal" href="#converting-between-file-encodings">Converting Between File Encodings</a></li>
- <li><a class="reference internal" href="#files-in-an-unknown-encoding">Files in an Unknown Encoding</a></li>
- </ul>
- </li>
- <li><a class="reference internal" href="#id3">References</a></li>
- </ul>
- </li>
- <li><a class="reference internal" href="#acknowledgements">Acknowledgements</a></li>
- </ul>
- </li>
- </ul>
- </div>
- <div>
- <h4>Previous topic</h4>
- <p class="topless"><a href="sorting.html"
- title="previous chapter">Sorting HOW TO</a></p>
- </div>
- <div>
- <h4>Next topic</h4>
- <p class="topless"><a href="urllib2.html"
- title="next chapter">HOWTO Fetch Internet Resources Using The urllib Package</a></p>
- </div>
- <div role="note" aria-label="source link">
- <h3>This Page</h3>
- <ul class="this-page-menu">
- <li><a href="../bugs.html">Report a Bug</a></li>
- <li>
- <a href="https://github.com/python/cpython/blob/main/Doc/howto/unicode.rst"
- rel="nofollow">Show Source
- </a>
- </li>
- </ul>
- </div>
- </div>
- </div>
- <div class="clearer"></div>
- </div>
- <div class="related" role="navigation" aria-label="related navigation">
- <h3>Navigation</h3>
- <ul>
- <li class="right" style="margin-right: 10px">
- <a href="../genindex.html" title="General Index"
- >index</a></li>
- <li class="right" >
- <a href="../py-modindex.html" title="Python Module Index"
- >modules</a> |</li>
- <li class="right" >
- <a href="urllib2.html" title="HOWTO Fetch Internet Resources Using The urllib Package"
- >next</a> |</li>
- <li class="right" >
- <a href="sorting.html" title="Sorting HOW TO"
- >previous</a> |</li>
- <li><img src="../_static/py.svg" alt="python logo" style="vertical-align: middle; margin-top: -1px"/></li>
- <li><a href="https://www.python.org/">Python</a> »</li>
- <li class="switchers">
- <div class="language_switcher_placeholder"></div>
- <div class="version_switcher_placeholder"></div>
- </li>
- <li>
-
- </li>
- <li id="cpython-language-and-version">
- <a href="../index.html">3.12.0 Documentation</a> »
- </li>
- <li class="nav-item nav-item-1"><a href="index.html" >Python HOWTOs</a> »</li>
- <li class="nav-item nav-item-this"><a href="">Unicode HOWTO</a></li>
- <li class="right">
-
- <div class="inline-search" role="search">
- <form class="inline-search" action="../search.html" method="get">
- <input placeholder="Quick search" aria-label="Quick search" type="search" name="q" />
- <input type="submit" value="Go" />
- </form>
- </div>
- |
- </li>
- <li class="right">
- <label class="theme-selector-label">
- Theme
- <select class="theme-selector" oninput="activateTheme(this.value)">
- <option value="auto" selected>Auto</option>
- <option value="light">Light</option>
- <option value="dark">Dark</option>
- </select>
- </label> |</li>
-
- </ul>
- </div>
- <div class="footer">
- © <a href="../copyright.html">Copyright</a> 2001-2023, Python Software Foundation.
- <br />
- This page is licensed under the Python Software Foundation License Version 2.
- <br />
- Examples, recipes, and other code in the documentation are additionally licensed under the Zero Clause BSD License.
- <br />
- See <a href="../license.html">History and License</a> for more information.<br />
- <br />
- The Python Software Foundation is a non-profit corporation.
- <a href="https://www.python.org/psf/donations/">Please donate.</a>
- <br />
- <br />
- Last updated on Oct 02, 2023.
- <a href="../bugs.html">Found a bug</a>?
- <br />
- Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
- </div>
- </body>
- </html>
|