unicode.html 82 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029
  1. <!DOCTYPE html>
  2. <html lang="en">
  3. <head>
  4. <meta charset="utf-8" />
  5. <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />
  6. <meta property="og:title" content="Unicode HOWTO" />
  7. <meta property="og:type" content="website" />
  8. <meta property="og:url" content="https://docs.python.org/3/howto/unicode.html" />
  9. <meta property="og:site_name" content="Python documentation" />
  10. <meta property="og:description" content="Release: 1.12. This HOWTO discusses Python’s support for the Unicode specification for representing textual data, and explains various problems that people commonly encounter when trying to work w..." />
  11. <meta property="og:image" content="https://docs.python.org/3/_static/og-image.png" />
  12. <meta property="og:image:alt" content="Python documentation" />
  13. <meta name="description" content="Release: 1.12. This HOWTO discusses Python’s support for the Unicode specification for representing textual data, and explains various problems that people commonly encounter when trying to work w..." />
  14. <meta property="og:image:width" content="200" />
  15. <meta property="og:image:height" content="200" />
  16. <meta name="theme-color" content="#3776ab" />
  17. <title>Unicode HOWTO &#8212; Python 3.12.0 documentation</title><meta name="viewport" content="width=device-width, initial-scale=1.0">
  18. <link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
  19. <link rel="stylesheet" type="text/css" href="../_static/pydoctheme.css?digest=b37c26da2f7529d09fe70b41c4b2133fe4931a90" />
  20. <link id="pygments_dark_css" media="(prefers-color-scheme: dark)" rel="stylesheet" type="text/css" href="../_static/pygments_dark.css" />
  21. <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
  22. <script src="../_static/jquery.js"></script>
  23. <script src="../_static/underscore.js"></script>
  24. <script src="../_static/doctools.js"></script>
  25. <script src="../_static/sidebar.js"></script>
  26. <link rel="search" type="application/opensearchdescription+xml"
  27. title="Search within Python 3.12.0 documentation"
  28. href="../_static/opensearch.xml"/>
  29. <link rel="author" title="About these documents" href="../about.html" />
  30. <link rel="index" title="Index" href="../genindex.html" />
  31. <link rel="search" title="Search" href="../search.html" />
  32. <link rel="copyright" title="Copyright" href="../copyright.html" />
  33. <link rel="next" title="HOWTO Fetch Internet Resources Using The urllib Package" href="urllib2.html" />
  34. <link rel="prev" title="Sorting HOW TO" href="sorting.html" />
  35. <link rel="canonical" href="https://docs.python.org/3/howto/unicode.html" />
  36. <style>
  37. @media only screen {
  38. table.full-width-table {
  39. width: 100%;
  40. }
  41. }
  42. </style>
  43. <link rel="stylesheet" href="../_static/pydoctheme_dark.css" media="(prefers-color-scheme: dark)" id="pydoctheme_dark_css">
  44. <link rel="shortcut icon" type="image/svg+xml" href="../_static/py.svg" />
  45. <script type="text/javascript" src="../_static/copybutton.js"></script>
  46. <script type="text/javascript" src="../_static/menu.js"></script>
  47. <script type="text/javascript" src="../_static/themetoggle.js"></script>
  48. </head>
  49. <body>
  50. <div class="mobile-nav">
  51. <input type="checkbox" id="menuToggler" class="toggler__input" aria-controls="navigation"
  52. aria-pressed="false" aria-expanded="false" role="button" aria-label="Menu" />
  53. <nav class="nav-content" role="navigation">
  54. <label for="menuToggler" class="toggler__label">
  55. <span></span>
  56. </label>
  57. <span class="nav-items-wrapper">
  58. <a href="https://www.python.org/" class="nav-logo">
  59. <img src="../_static/py.svg" alt="Logo"/>
  60. </a>
  61. <span class="version_switcher_placeholder"></span>
  62. <form role="search" class="search" action="../search.html" method="get">
  63. <svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" class="search-icon">
  64. <path fill-rule="nonzero" fill="currentColor" d="M15.5 14h-.79l-.28-.27a6.5 6.5 0 001.48-5.34c-.47-2.78-2.79-5-5.59-5.34a6.505 6.505 0 00-7.27 7.27c.34 2.8 2.56 5.12 5.34 5.59a6.5 6.5 0 005.34-1.48l.27.28v.79l4.25 4.25c.41.41 1.08.41 1.49 0 .41-.41.41-1.08 0-1.49L15.5 14zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"></path>
  65. </svg>
  66. <input placeholder="Quick search" aria-label="Quick search" type="search" name="q" />
  67. <input type="submit" value="Go"/>
  68. </form>
  69. </span>
  70. </nav>
  71. <div class="menu-wrapper">
  72. <nav class="menu" role="navigation" aria-label="main navigation">
  73. <div class="language_switcher_placeholder"></div>
  74. <label class="theme-selector-label">
  75. Theme
  76. <select class="theme-selector" oninput="activateTheme(this.value)">
  77. <option value="auto" selected>Auto</option>
  78. <option value="light">Light</option>
  79. <option value="dark">Dark</option>
  80. </select>
  81. </label>
  82. <div>
  83. <h3><a href="../contents.html">Table of Contents</a></h3>
  84. <ul>
  85. <li><a class="reference internal" href="#">Unicode HOWTO</a><ul>
  86. <li><a class="reference internal" href="#introduction-to-unicode">Introduction to Unicode</a><ul>
  87. <li><a class="reference internal" href="#definitions">Definitions</a></li>
  88. <li><a class="reference internal" href="#encodings">Encodings</a></li>
  89. <li><a class="reference internal" href="#references">References</a></li>
  90. </ul>
  91. </li>
  92. <li><a class="reference internal" href="#python-s-unicode-support">Python’s Unicode Support</a><ul>
  93. <li><a class="reference internal" href="#the-string-type">The String Type</a></li>
  94. <li><a class="reference internal" href="#converting-to-bytes">Converting to Bytes</a></li>
  95. <li><a class="reference internal" href="#unicode-literals-in-python-source-code">Unicode Literals in Python Source Code</a></li>
  96. <li><a class="reference internal" href="#unicode-properties">Unicode Properties</a></li>
  97. <li><a class="reference internal" href="#comparing-strings">Comparing Strings</a></li>
  98. <li><a class="reference internal" href="#unicode-regular-expressions">Unicode Regular Expressions</a></li>
  99. <li><a class="reference internal" href="#id2">References</a></li>
  100. </ul>
  101. </li>
  102. <li><a class="reference internal" href="#reading-and-writing-unicode-data">Reading and Writing Unicode Data</a><ul>
  103. <li><a class="reference internal" href="#unicode-filenames">Unicode filenames</a></li>
  104. <li><a class="reference internal" href="#tips-for-writing-unicode-aware-programs">Tips for Writing Unicode-aware Programs</a><ul>
  105. <li><a class="reference internal" href="#converting-between-file-encodings">Converting Between File Encodings</a></li>
  106. <li><a class="reference internal" href="#files-in-an-unknown-encoding">Files in an Unknown Encoding</a></li>
  107. </ul>
  108. </li>
  109. <li><a class="reference internal" href="#id3">References</a></li>
  110. </ul>
  111. </li>
  112. <li><a class="reference internal" href="#acknowledgements">Acknowledgements</a></li>
  113. </ul>
  114. </li>
  115. </ul>
  116. </div>
  117. <div>
  118. <h4>Previous topic</h4>
  119. <p class="topless"><a href="sorting.html"
  120. title="previous chapter">Sorting HOW TO</a></p>
  121. </div>
  122. <div>
  123. <h4>Next topic</h4>
  124. <p class="topless"><a href="urllib2.html"
  125. title="next chapter">HOWTO Fetch Internet Resources Using The urllib Package</a></p>
  126. </div>
  127. <div role="note" aria-label="source link">
  128. <h3>This Page</h3>
  129. <ul class="this-page-menu">
  130. <li><a href="../bugs.html">Report a Bug</a></li>
  131. <li>
  132. <a href="https://github.com/python/cpython/blob/main/Doc/howto/unicode.rst"
  133. rel="nofollow">Show Source
  134. </a>
  135. </li>
  136. </ul>
  137. </div>
  138. </nav>
  139. </div>
  140. </div>
  141. <div class="related" role="navigation" aria-label="related navigation">
  142. <h3>Navigation</h3>
  143. <ul>
  144. <li class="right" style="margin-right: 10px">
  145. <a href="../genindex.html" title="General Index"
  146. accesskey="I">index</a></li>
  147. <li class="right" >
  148. <a href="../py-modindex.html" title="Python Module Index"
  149. >modules</a> |</li>
  150. <li class="right" >
  151. <a href="urllib2.html" title="HOWTO Fetch Internet Resources Using The urllib Package"
  152. accesskey="N">next</a> |</li>
  153. <li class="right" >
  154. <a href="sorting.html" title="Sorting HOW TO"
  155. accesskey="P">previous</a> |</li>
  156. <li><img src="../_static/py.svg" alt="python logo" style="vertical-align: middle; margin-top: -1px"/></li>
  157. <li><a href="https://www.python.org/">Python</a> &#187;</li>
  158. <li class="switchers">
  159. <div class="language_switcher_placeholder"></div>
  160. <div class="version_switcher_placeholder"></div>
  161. </li>
  162. <li>
  163. </li>
  164. <li id="cpython-language-and-version">
  165. <a href="../index.html">3.12.0 Documentation</a> &#187;
  166. </li>
  167. <li class="nav-item nav-item-1"><a href="index.html" accesskey="U">Python HOWTOs</a> &#187;</li>
  168. <li class="nav-item nav-item-this"><a href="">Unicode HOWTO</a></li>
  169. <li class="right">
  170. <div class="inline-search" role="search">
  171. <form class="inline-search" action="../search.html" method="get">
  172. <input placeholder="Quick search" aria-label="Quick search" type="search" name="q" />
  173. <input type="submit" value="Go" />
  174. </form>
  175. </div>
  176. |
  177. </li>
  178. <li class="right">
  179. <label class="theme-selector-label">
  180. Theme
  181. <select class="theme-selector" oninput="activateTheme(this.value)">
  182. <option value="auto" selected>Auto</option>
  183. <option value="light">Light</option>
  184. <option value="dark">Dark</option>
  185. </select>
  186. </label> |</li>
  187. </ul>
  188. </div>
  189. <div class="document">
  190. <div class="documentwrapper">
  191. <div class="bodywrapper">
  192. <div class="body" role="main">
  193. <section id="unicode-howto">
  194. <span id="id1"></span><h1>Unicode HOWTO<a class="headerlink" href="#unicode-howto" title="Permalink to this headline">¶</a></h1>
  195. <dl class="field-list simple">
  196. <dt class="field-odd">Release</dt>
  197. <dd class="field-odd"><p>1.12</p>
  198. </dd>
  199. </dl>
  200. <p>This HOWTO discusses Python’s support for the Unicode specification
  201. for representing textual data, and explains various problems that
  202. people commonly encounter when trying to work with Unicode.</p>
  203. <section id="introduction-to-unicode">
  204. <h2>Introduction to Unicode<a class="headerlink" href="#introduction-to-unicode" title="Permalink to this headline">¶</a></h2>
  205. <section id="definitions">
  206. <h3>Definitions<a class="headerlink" href="#definitions" title="Permalink to this headline">¶</a></h3>
  207. <p>Today’s programs need to be able to handle a wide variety of
  208. characters. Applications are often internationalized to display
  209. messages and output in a variety of user-selectable languages; the
  210. same program might need to output an error message in English, French,
  211. Japanese, Hebrew, or Russian. Web content can be written in any of
  212. these languages and can also include a variety of emoji symbols.
  213. Python’s string type uses the Unicode Standard for representing
  214. characters, which lets Python programs work with all these different
  215. possible characters.</p>
  216. <p>Unicode (<a class="reference external" href="https://www.unicode.org/">https://www.unicode.org/</a>) is a specification that aims to
  217. list every character used by human languages and give each character
  218. its own unique code. The Unicode specifications are continually
  219. revised and updated to add new languages and symbols.</p>
  220. <p>A <strong>character</strong> is the smallest possible component of a text. ‘A’, ‘B’, ‘C’,
  221. etc., are all different characters. So are ‘È’ and ‘Í’. Characters vary
  222. depending on the language or context you’re talking
  223. about. For example, there’s a character for “Roman Numeral One”, ‘Ⅰ’, that’s
  224. separate from the uppercase letter ‘I’. They’ll usually look the same,
  225. but these are two different characters that have different meanings.</p>
  226. <p>The Unicode standard describes how characters are represented by
  227. <strong>code points</strong>. A code point value is an integer in the range 0 to
  228. 0x10FFFF (about 1.1 million values; the
  229. <a class="reference external" href="https://www.unicode.org/versions/latest/#Summary">actual number assigned</a>
  230. is less than that). In the standard and in this document, a code point is written
  231. using the notation <code class="docutils literal notranslate"><span class="pre">U+265E</span></code> to mean the character with value
  232. <code class="docutils literal notranslate"><span class="pre">0x265e</span></code> (9,822 in decimal).</p>
  233. <p>The Unicode standard contains a lot of tables listing characters and
  234. their corresponding code points:</p>
  235. <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>0061 &#39;a&#39;; LATIN SMALL LETTER A
  236. 0062 &#39;b&#39;; LATIN SMALL LETTER B
  237. 0063 &#39;c&#39;; LATIN SMALL LETTER C
  238. ...
  239. 007B &#39;{&#39;; LEFT CURLY BRACKET
  240. ...
  241. 2167 &#39;Ⅷ&#39;; ROMAN NUMERAL EIGHT
  242. 2168 &#39;Ⅸ&#39;; ROMAN NUMERAL NINE
  243. ...
  244. 265E &#39;♞&#39;; BLACK CHESS KNIGHT
  245. 265F &#39;♟&#39;; BLACK CHESS PAWN
  246. ...
  247. 1F600 &#39;😀&#39;; GRINNING FACE
  248. 1F609 &#39;😉&#39;; WINKING FACE
  249. ...
  250. </pre></div>
  251. </div>
  252. <p>Strictly, these definitions imply that it’s meaningless to say ‘this is
  253. character <code class="docutils literal notranslate"><span class="pre">U+265E</span></code>’. <code class="docutils literal notranslate"><span class="pre">U+265E</span></code> is a code point, which represents some particular
  254. character; in this case, it represents the character ‘BLACK CHESS KNIGHT’,
  255. ‘♞’. In
  256. informal contexts, this distinction between code points and characters will
  257. sometimes be forgotten.</p>
  258. <p>A character is represented on a screen or on paper by a set of graphical
  259. elements that’s called a <strong>glyph</strong>. The glyph for an uppercase A, for example,
  260. is two diagonal strokes and a horizontal stroke, though the exact details will
  261. depend on the font being used. Most Python code doesn’t need to worry about
  262. glyphs; figuring out the correct glyph to display is generally the job of a GUI
  263. toolkit or a terminal’s font renderer.</p>
  264. </section>
  265. <section id="encodings">
  266. <h3>Encodings<a class="headerlink" href="#encodings" title="Permalink to this headline">¶</a></h3>
  267. <p>To summarize the previous section: a Unicode string is a sequence of
  268. code points, which are numbers from 0 through <code class="docutils literal notranslate"><span class="pre">0x10FFFF</span></code> (1,114,111
  269. decimal). This sequence of code points needs to be represented in
  270. memory as a sequence of <strong>code units</strong>, and <strong>code units</strong> are then mapped
  271. to 8-bit bytes. The rules for translating a Unicode string into a
  272. sequence of bytes are called a <strong>character encoding</strong>, or just
  273. an <strong>encoding</strong>.</p>
  274. <p>The first encoding you might think of is using 32-bit integers as the
  275. code unit, and then using the CPU’s representation of 32-bit integers.
  276. In this representation, the string “Python” might look like this:</p>
  277. <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>   P           y           t           h           o           n
  278. 0x50 00 00 00 79 00 00 00 74 00 00 00 68 00 00 00 6f 00 00 00 6e 00 00 00
  279.    0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
  280. </pre></div>
  281. </div>
  282. <p>This representation is straightforward but using it presents a number of
  283. problems.</p>
  284. <ol class="arabic simple">
  285. <li><p>It’s not portable; different processors order the bytes differently.</p></li>
  286. <li><p>It’s very wasteful of space. In most texts, the majority of the code points
  287. are less than 128, or less than 256, so a lot of space is occupied by <code class="docutils literal notranslate"><span class="pre">0x00</span></code>
  288. bytes. The above string takes 24 bytes compared to the 6 bytes needed for an
  289. ASCII representation. Increased RAM usage doesn’t matter too much (desktop
  290. computers have gigabytes of RAM, and strings aren’t usually that large), but
  291. expanding our usage of disk and network bandwidth by a factor of 4 is
  292. intolerable.</p></li>
  293. <li><p>It’s not compatible with existing C functions such as <code class="docutils literal notranslate"><span class="pre">strlen()</span></code>, so a new
  294. family of wide string functions would need to be used.</p></li>
  295. </ol>
  296. <p>Therefore this encoding isn’t used very much, and people instead choose other
  297. encodings that are more efficient and convenient, such as UTF-8.</p>
  298. <p>UTF-8 is one of the most commonly used encodings, and Python often
  299. defaults to using it. UTF stands for “Unicode Transformation Format”,
  300. and the ‘8’ means that 8-bit values are used in the encoding. (There
  301. are also UTF-16 and UTF-32 encodings, but they are less frequently
  302. used than UTF-8.) UTF-8 uses the following rules:</p>
  303. <ol class="arabic simple">
  304. <li><p>If the code point is &lt; 128, it’s represented by the corresponding byte value.</p></li>
  305. <li><p>If the code point is &gt;= 128, it’s turned into a sequence of two, three, or
  306. four bytes, where each byte of the sequence is between 128 and 255.</p></li>
  307. </ol>
  308. <p>UTF-8 has several convenient properties:</p>
  309. <ol class="arabic simple">
  310. <li><p>It can handle any Unicode code point.</p></li>
  311. <li><p>A Unicode string is turned into a sequence of bytes that contains embedded
  312. zero bytes only where they represent the null character (U+0000). This means
  313. that UTF-8 strings can be processed by C functions such as <code class="docutils literal notranslate"><span class="pre">strcpy()</span></code> and sent
  314. through protocols that can’t handle zero bytes for anything other than
  315. end-of-string markers.</p></li>
  316. <li><p>A string of ASCII text is also valid UTF-8 text.</p></li>
  317. <li><p>UTF-8 is fairly compact; the majority of commonly used characters can be
  318. represented with one or two bytes.</p></li>
  319. <li><p>If bytes are corrupted or lost, it’s possible to determine the start of the
  320. next UTF-8-encoded code point and resynchronize. It’s also unlikely that
  321. random 8-bit data will look like valid UTF-8.</p></li>
  322. <li><p>UTF-8 is a byte-oriented encoding. The encoding specifies that each
  323. character is represented by a specific sequence of one or more bytes. This
  324. avoids the byte-ordering issues that can occur with integer- and word-oriented
  325. encodings, like UTF-16 and UTF-32, where the sequence of bytes varies depending
  326. on the hardware on which the string was encoded.</p></li>
  327. </ol>
  328. </section>
  329. <section id="references">
  330. <h3>References<a class="headerlink" href="#references" title="Permalink to this headline">¶</a></h3>
  331. <p>The <a class="reference external" href="https://www.unicode.org">Unicode Consortium site</a> has character charts, a
  332. glossary, and PDF versions of the Unicode specification. Be prepared for some
  333. difficult reading. <a class="reference external" href="https://www.unicode.org/history/">A chronology</a> of the
  334. origin and development of Unicode is also available on the site.</p>
  335. <p>On the Computerphile YouTube channel, Tom Scott briefly
  336. <a class="reference external" href="https://www.youtube.com/watch?v=MijmeoH9LT4">discusses the history of Unicode and UTF-8</a>
  337. (9 minutes 36 seconds).</p>
  338. <p>To help understand the standard, Jukka Korpela has written <a class="reference external" href="https://jkorpela.fi/unicode/guide.html">an introductory
  339. guide</a> to reading the
  340. Unicode character tables.</p>
  341. <p>Another <a class="reference external" href="https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/">good introductory article</a>
  342. was written by Joel Spolsky.
  343. If this introduction didn’t make things clear to you, you should try
  344. reading this alternate article before continuing.</p>
  345. <p>Wikipedia entries are often helpful; see the entries for “<a class="reference external" href="https://en.wikipedia.org/wiki/Character_encoding">character encoding</a>” and <a class="reference external" href="https://en.wikipedia.org/wiki/UTF-8">UTF-8</a>, for example.</p>
  346. </section>
  347. </section>
  348. <section id="python-s-unicode-support">
  349. <h2>Python’s Unicode Support<a class="headerlink" href="#python-s-unicode-support" title="Permalink to this headline">¶</a></h2>
  350. <p>Now that you’ve learned the rudiments of Unicode, we can look at Python’s
  351. Unicode features.</p>
  352. <section id="the-string-type">
  353. <h3>The String Type<a class="headerlink" href="#the-string-type" title="Permalink to this headline">¶</a></h3>
  354. <p>Since Python 3.0, the language’s <a class="reference internal" href="../library/stdtypes.html#str" title="str"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a> type contains Unicode
  355. characters, meaning any string created using <code class="docutils literal notranslate"><span class="pre">&quot;unicode</span> <span class="pre">rocks!&quot;</span></code>, <code class="docutils literal notranslate"><span class="pre">'unicode</span>
  356. <span class="pre">rocks!'</span></code>, or the triple-quoted string syntax is stored as Unicode.</p>
  357. <p>The default encoding for Python source code is UTF-8, so you can simply
  358. include a Unicode character in a string literal:</p>
  359. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="k">try</span><span class="p">:</span>
  360.     <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">&#39;/tmp/input.txt&#39;</span><span class="p">,</span> <span class="s1">&#39;r&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
  361.         <span class="o">...</span>
  362. <span class="k">except</span> <span class="ne">OSError</span><span class="p">:</span>
  363.     <span class="c1"># &#39;File not found&#39; error message.</span>
  364.     <span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Fichier non trouvé&quot;</span><span class="p">)</span>
  365. </pre></div>
  366. </div>
  367. <p>Side note: Python 3 also supports using Unicode characters in identifiers:</p>
  368. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="n">répertoire</span> <span class="o">=</span> <span class="s2">&quot;/tmp/records.log&quot;</span>
  369. <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">répertoire</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
  370.     <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s2">&quot;test</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">)</span>
  371. </pre></div>
  372. </div>
  373. <p>If you can’t enter a particular character in your editor or want to
  374. keep the source code ASCII-only for some reason, you can also use
  375. escape sequences in string literals. (Depending on your system,
  376. you may see the actual capital-delta glyph instead of a <code class="docutils literal notranslate"><span class="pre">\u</span></code> escape.)</p>
  377. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="s2">&quot;</span><span class="se">\N{GREEK CAPITAL LETTER DELTA}</span><span class="s2">&quot;</span> <span class="c1"># Using the character name</span>
  378. <span class="go">&#39;\u0394&#39;</span>
  379. <span class="gp">&gt;&gt;&gt; </span><span class="s2">&quot;</span><span class="se">\u0394</span><span class="s2">&quot;</span> <span class="c1"># Using a 16-bit hex value</span>
  380. <span class="go">&#39;\u0394&#39;</span>
  381. <span class="gp">&gt;&gt;&gt; </span><span class="s2">&quot;</span><span class="se">\U00000394</span><span class="s2">&quot;</span> <span class="c1"># Using a 32-bit hex value</span>
  382. <span class="go">&#39;\u0394&#39;</span>
  383. </pre></div>
  384. </div>
  385. <p>In addition, one can create a string using the <a class="reference internal" href="../library/stdtypes.html#bytes.decode" title="bytes.decode"><code class="xref py py-func docutils literal notranslate"><span class="pre">decode()</span></code></a> method of
  386. <a class="reference internal" href="../library/stdtypes.html#bytes" title="bytes"><code class="xref py py-class docutils literal notranslate"><span class="pre">bytes</span></code></a>. This method takes an <em>encoding</em> argument, such as <code class="docutils literal notranslate"><span class="pre">UTF-8</span></code>,
  387. and optionally an <em>errors</em> argument.</p>
  388. <p>The <em>errors</em> argument specifies the response when the input bytes can’t be
  389. converted according to the encoding’s rules. Legal values for this argument are
  390. <code class="docutils literal notranslate"><span class="pre">'strict'</span></code> (raise a <a class="reference internal" href="../library/exceptions.html#UnicodeDecodeError" title="UnicodeDecodeError"><code class="xref py py-exc docutils literal notranslate"><span class="pre">UnicodeDecodeError</span></code></a> exception), <code class="docutils literal notranslate"><span class="pre">'replace'</span></code> (use
  391. <code class="docutils literal notranslate"><span class="pre">U+FFFD</span></code>, <code class="docutils literal notranslate"><span class="pre">REPLACEMENT</span> <span class="pre">CHARACTER</span></code>), <code class="docutils literal notranslate"><span class="pre">'ignore'</span></code> (just leave the
  392. character out of the Unicode result), or <code class="docutils literal notranslate"><span class="pre">'backslashreplace'</span></code> (inserts a
  393. <code class="docutils literal notranslate"><span class="pre">\xNN</span></code> escape sequence).
  394. The following examples show the differences:</p>
  395. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="sa">b</span><span class="s1">&#39;</span><span class="se">\x80</span><span class="s1">abc&#39;</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">,</span> <span class="s2">&quot;strict&quot;</span><span class="p">)</span>
  396. <span class="gt">Traceback (most recent call last):</span>
  397. <span class="w"> </span><span class="o">...</span>
  398. <span class="gr">UnicodeDecodeError</span>: <span class="n">&#39;utf-8&#39; codec can&#39;t decode byte 0x80 in position 0:</span>
  399. <span class="x"> invalid start byte</span>
  400. <span class="gp">&gt;&gt;&gt; </span><span class="sa">b</span><span class="s1">&#39;</span><span class="se">\x80</span><span class="s1">abc&#39;</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">,</span> <span class="s2">&quot;replace&quot;</span><span class="p">)</span>
  401. <span class="go">&#39;\ufffdabc&#39;</span>
  402. <span class="gp">&gt;&gt;&gt; </span><span class="sa">b</span><span class="s1">&#39;</span><span class="se">\x80</span><span class="s1">abc&#39;</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">,</span> <span class="s2">&quot;backslashreplace&quot;</span><span class="p">)</span>
  403. <span class="go">&#39;\\x80abc&#39;</span>
  404. <span class="gp">&gt;&gt;&gt; </span><span class="sa">b</span><span class="s1">&#39;</span><span class="se">\x80</span><span class="s1">abc&#39;</span><span class="o">.</span><span class="n">decode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">,</span> <span class="s2">&quot;ignore&quot;</span><span class="p">)</span>
  405. <span class="go">&#39;abc&#39;</span>
  406. </pre></div>
  407. </div>
  408. <p>Encodings are specified as strings containing the encoding’s name. Python
  409. comes with roughly 100 different encodings; see the Python Library Reference at
  410. <a class="reference internal" href="../library/codecs.html#standard-encodings"><span class="std std-ref">Standard Encodings</span></a> for a list. Some encodings have multiple names; for
  411. example, <code class="docutils literal notranslate"><span class="pre">'latin-1'</span></code>, <code class="docutils literal notranslate"><span class="pre">'iso_8859_1'</span></code> and <code class="docutils literal notranslate"><span class="pre">'8859'</span></code> are all synonyms for
  412. the same encoding.</p>
  413. <p>One-character Unicode strings can also be created with the <a class="reference internal" href="../library/functions.html#chr" title="chr"><code class="xref py py-func docutils literal notranslate"><span class="pre">chr()</span></code></a>
  414. built-in function, which takes integers and returns a Unicode string of length 1
  415. that contains the corresponding code point. The reverse operation is the
  416. built-in <a class="reference internal" href="../library/functions.html#ord" title="ord"><code class="xref py py-func docutils literal notranslate"><span class="pre">ord()</span></code></a> function that takes a one-character Unicode string and
  417. returns the code point value:</p>
  418. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="nb">chr</span><span class="p">(</span><span class="mi">57344</span><span class="p">)</span>
  419. <span class="go">&#39;\ue000&#39;</span>
  420. <span class="gp">&gt;&gt;&gt; </span><span class="nb">ord</span><span class="p">(</span><span class="s1">&#39;</span><span class="se">\ue000</span><span class="s1">&#39;</span><span class="p">)</span>
  421. <span class="go">57344</span>
  422. </pre></div>
  423. </div>
  424. </section>
  425. <section id="converting-to-bytes">
  426. <h3>Converting to Bytes<a class="headerlink" href="#converting-to-bytes" title="Permalink to this headline">¶</a></h3>
  427. <p>The opposite method of <a class="reference internal" href="../library/stdtypes.html#bytes.decode" title="bytes.decode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">bytes.decode()</span></code></a> is <a class="reference internal" href="../library/stdtypes.html#str.encode" title="str.encode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">str.encode()</span></code></a>,
  428. which returns a <a class="reference internal" href="../library/stdtypes.html#bytes" title="bytes"><code class="xref py py-class docutils literal notranslate"><span class="pre">bytes</span></code></a> representation of the Unicode string, encoded in the
  429. requested <em>encoding</em>.</p>
  430. <p>The <em>errors</em> parameter is the same as the parameter of the
  431. <a class="reference internal" href="../library/stdtypes.html#bytes.decode" title="bytes.decode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">decode()</span></code></a> method but supports a few more possible handlers. As well as
  432. <code class="docutils literal notranslate"><span class="pre">'strict'</span></code>, <code class="docutils literal notranslate"><span class="pre">'ignore'</span></code>, and <code class="docutils literal notranslate"><span class="pre">'replace'</span></code> (which in this case
  433. inserts a question mark instead of the unencodable character), there is
  434. also <code class="docutils literal notranslate"><span class="pre">'xmlcharrefreplace'</span></code> (inserts an XML character reference),
  435. <code class="docutils literal notranslate"><span class="pre">'backslashreplace'</span></code> (inserts a <code class="docutils literal notranslate"><span class="pre">\uNNNN</span></code> escape sequence) and
  436. <code class="docutils literal notranslate"><span class="pre">'namereplace'</span></code> (inserts a <code class="docutils literal notranslate"><span class="pre">\N{...}</span></code> escape sequence).</p>
  437. <p>The following example shows the different results:</p>
  438. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">u</span> <span class="o">=</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">40960</span><span class="p">)</span> <span class="o">+</span> <span class="s1">&#39;abcd&#39;</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">1972</span><span class="p">)</span>
  439. <span class="gp">&gt;&gt;&gt; </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">)</span>
  440. <span class="go">b&#39;\xea\x80\x80abcd\xde\xb4&#39;</span>
  441. <span class="gp">&gt;&gt;&gt; </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;ascii&#39;</span><span class="p">)</span>
  442. <span class="gt">Traceback (most recent call last):</span>
  443. <span class="w"> </span><span class="o">...</span>
  444. <span class="gr">UnicodeEncodeError</span>: <span class="n">&#39;ascii&#39; codec can&#39;t encode character &#39;\ua000&#39; in</span>
  445. <span class="x"> position 0: ordinal not in range(128)</span>
  446. <span class="gp">&gt;&gt;&gt; </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;ascii&#39;</span><span class="p">,</span> <span class="s1">&#39;ignore&#39;</span><span class="p">)</span>
  447. <span class="go">b&#39;abcd&#39;</span>
  448. <span class="gp">&gt;&gt;&gt; </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;ascii&#39;</span><span class="p">,</span> <span class="s1">&#39;replace&#39;</span><span class="p">)</span>
  449. <span class="go">b&#39;?abcd?&#39;</span>
  450. <span class="gp">&gt;&gt;&gt; </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;ascii&#39;</span><span class="p">,</span> <span class="s1">&#39;xmlcharrefreplace&#39;</span><span class="p">)</span>
  451. <span class="go">b&#39;&amp;#40960;abcd&amp;#1972;&#39;</span>
  452. <span class="gp">&gt;&gt;&gt; </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;ascii&#39;</span><span class="p">,</span> <span class="s1">&#39;backslashreplace&#39;</span><span class="p">)</span>
  453. <span class="go">b&#39;\\ua000abcd\\u07b4&#39;</span>
  454. <span class="gp">&gt;&gt;&gt; </span><span class="n">u</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s1">&#39;ascii&#39;</span><span class="p">,</span> <span class="s1">&#39;namereplace&#39;</span><span class="p">)</span>
  455. <span class="go">b&#39;\\N{YI SYLLABLE IT}abcd\\u07b4&#39;</span>
  456. </pre></div>
  457. </div>
  458. <p>The low-level routines for registering and accessing the available
  459. encodings are found in the <a class="reference internal" href="../library/codecs.html#module-codecs" title="codecs: Encode and decode data and streams."><code class="xref py py-mod docutils literal notranslate"><span class="pre">codecs</span></code></a> module. Implementing new
  460. encodings also requires understanding the <a class="reference internal" href="../library/codecs.html#module-codecs" title="codecs: Encode and decode data and streams."><code class="xref py py-mod docutils literal notranslate"><span class="pre">codecs</span></code></a> module.
  461. However, the encoding and decoding functions returned by this module
  462. are usually more low-level than is comfortable, and writing new encodings
  463. is a specialized task, so the module won’t be covered in this HOWTO.</p>
  464. </section>
  465. <section id="unicode-literals-in-python-source-code">
  466. <h3>Unicode Literals in Python Source Code<a class="headerlink" href="#unicode-literals-in-python-source-code" title="Permalink to this headline">¶</a></h3>
  467. <p>In Python source code, specific Unicode code points can be written using the
  468. <code class="docutils literal notranslate"><span class="pre">\u</span></code> escape sequence, which is followed by four hex digits giving the code
  469. point. The <code class="docutils literal notranslate"><span class="pre">\U</span></code> escape sequence is similar, but expects eight hex digits,
  470. not four:</p>
  471. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">s</span> <span class="o">=</span> <span class="s2">&quot;a</span><span class="se">\xac\u1234\u20ac\U00008000</span><span class="s2">&quot;</span>
  472. <span class="gp">... </span><span class="c1">#     ^^^^ two-digit hex escape</span>
  473. <span class="gp">... </span><span class="c1">#         ^^^^^^ four-digit Unicode escape</span>
  474. <span class="gp">... </span><span class="c1">#                     ^^^^^^^^^^ eight-digit Unicode escape</span>
  475. <span class="gp">&gt;&gt;&gt; </span><span class="p">[</span><span class="nb">ord</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">s</span><span class="p">]</span>
  476. <span class="go">[97, 172, 4660, 8364, 32768]</span>
  477. </pre></div>
  478. </div>
  479. <p>Using escape sequences for code points greater than 127 is fine in small doses,
  480. but becomes an annoyance if you’re using many accented characters, as you would
  481. in a program with messages in French or some other accent-using language. You
  482. can also assemble strings using the <a class="reference internal" href="../library/functions.html#chr" title="chr"><code class="xref py py-func docutils literal notranslate"><span class="pre">chr()</span></code></a> built-in function, but this is
  483. even more tedious.</p>
  484. <p>Ideally, you’d want to be able to write literals in your language’s natural
  485. encoding. You could then edit Python source code with your favorite editor
  486. which would display the accented characters naturally, and have the right
  487. characters used at runtime.</p>
  488. <p>Python supports writing source code in UTF-8 by default, but you can use almost
  489. any encoding if you declare the encoding being used. This is done by including
  490. a special comment as either the first or second line of the source file:</p>
  491. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="ch">#!/usr/bin/env python</span>
  492. <span class="c1"># -*- coding: latin-1 -*-</span>
  493. <span class="n">u</span> <span class="o">=</span> <span class="s1">&#39;abcdé&#39;</span>
  494. <span class="nb">print</span><span class="p">(</span><span class="nb">ord</span><span class="p">(</span><span class="n">u</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]))</span>
  495. </pre></div>
  496. </div>
  497. <p>The syntax is inspired by Emacs’s notation for specifying variables local to a
  498. file. Emacs supports many different variables, but Python only supports
  499. ‘coding’. The <code class="docutils literal notranslate"><span class="pre">-*-</span></code> symbols indicate to Emacs that the comment is special;
  500. they have no significance to Python but are a convention. Python looks for
  501. <code class="docutils literal notranslate"><span class="pre">coding:</span> <span class="pre">name</span></code> or <code class="docutils literal notranslate"><span class="pre">coding=name</span></code> in the comment.</p>
  502. <p>If you don’t include such a comment, the default encoding used will be UTF-8 as
  503. already mentioned. See also <span class="target" id="index-0"></span><a class="pep reference external" href="https://peps.python.org/pep-0263/"><strong>PEP 263</strong></a> for more information.</p>
  504. </section>
  505. <section id="unicode-properties">
  506. <h3>Unicode Properties<a class="headerlink" href="#unicode-properties" title="Permalink to this headline">¶</a></h3>
  507. <p>The Unicode specification includes a database of information about
  508. code points. For each defined code point, the information includes
  509. the character’s name, its category, the numeric value if applicable
  510. (for characters representing numeric concepts such as the Roman
  511. numerals, fractions such as one-third and four-fifths, etc.). There
  512. are also display-related properties, such as how to use the code point
  513. in bidirectional text.</p>
  514. <p>The following program displays some information about several characters, and
  515. prints the numeric value of one particular character:</p>
  516. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">unicodedata</span>
  517. <span class="n">u</span> <span class="o">=</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">233</span><span class="p">)</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mh">0x0bf2</span><span class="p">)</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">3972</span><span class="p">)</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">6000</span><span class="p">)</span> <span class="o">+</span> <span class="nb">chr</span><span class="p">(</span><span class="mi">13231</span><span class="p">)</span>
  518. <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">c</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">u</span><span class="p">):</span>
  519.     <span class="nb">print</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="s1">&#39;</span><span class="si">%04x</span><span class="s1">&#39;</span> <span class="o">%</span> <span class="nb">ord</span><span class="p">(</span><span class="n">c</span><span class="p">),</span> <span class="n">unicodedata</span><span class="o">.</span><span class="n">category</span><span class="p">(</span><span class="n">c</span><span class="p">),</span> <span class="n">end</span><span class="o">=</span><span class="s2">&quot; &quot;</span><span class="p">)</span>
  520.     <span class="nb">print</span><span class="p">(</span><span class="n">unicodedata</span><span class="o">.</span><span class="n">name</span><span class="p">(</span><span class="n">c</span><span class="p">))</span>
  521. <span class="c1"># Get numeric value of second character</span>
  522. <span class="nb">print</span><span class="p">(</span><span class="n">unicodedata</span><span class="o">.</span><span class="n">numeric</span><span class="p">(</span><span class="n">u</span><span class="p">[</span><span class="mi">1</span><span class="p">]))</span>
  523. </pre></div>
  524. </div>
  525. <p>When run, this prints:</p>
  526. <div class="highlight-none notranslate"><div class="highlight"><pre><span></span>0 00e9 Ll LATIN SMALL LETTER E WITH ACUTE
  527. 1 0bf2 No TAMIL NUMBER ONE THOUSAND
  528. 2 0f84 Mn TIBETAN MARK HALANTA
  529. 3 1770 Lo TAGBANWA LETTER SA
  530. 4 33af So SQUARE RAD OVER S SQUARED
  531. 1000.0
  532. </pre></div>
  533. </div>
  534. <p>The category codes are abbreviations describing the nature of the character.
  535. These are grouped into categories such as “Letter”, “Number”, “Punctuation”, or
  536. “Symbol”, which in turn are broken up into subcategories. To take the codes
  537. from the above output, <code class="docutils literal notranslate"><span class="pre">'Ll'</span></code> means “Letter, lowercase”, <code class="docutils literal notranslate"><span class="pre">'No'</span></code> means
  538. “Number, other”, <code class="docutils literal notranslate"><span class="pre">'Mn'</span></code> is “Mark, nonspacing”, and <code class="docutils literal notranslate"><span class="pre">'So'</span></code> is “Symbol,
  539. other”. See
  540. <a class="reference external" href="https://www.unicode.org/reports/tr44/#General_Category_Values">the General Category Values section of the Unicode Character Database documentation</a> for a
  541. list of category codes.</p>
  542. </section>
  543. <section id="comparing-strings">
  544. <h3>Comparing Strings<a class="headerlink" href="#comparing-strings" title="Permalink to this headline">¶</a></h3>
  545. <p>Unicode adds some complication to comparing strings, because the same
  546. set of characters can be represented by different sequences of code
  547. points. For example, a letter like ‘ê’ can be represented as a single
  548. code point U+00EA, or as U+0065 U+0302, which is the code point for
  549. ‘e’ followed by a code point for ‘COMBINING CIRCUMFLEX ACCENT’. These
  550. will produce the same output when printed, but one is a string of
  551. length 1 and the other is of length 2.</p>
  552. <p>One tool for a case-insensitive comparison is the
  553. <a class="reference internal" href="../library/stdtypes.html#str.casefold" title="str.casefold"><code class="xref py py-meth docutils literal notranslate"><span class="pre">casefold()</span></code></a> string method that converts a string to a
  554. case-insensitive form following an algorithm described by the Unicode
  555. Standard. This algorithm has special handling for characters such as
  556. the German letter ‘ß’ (code point U+00DF), which becomes the pair of
  557. lowercase letters ‘ss’.</p>
  558. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">street</span> <span class="o">=</span> <span class="s1">&#39;Gürzenichstraße&#39;</span>
  559. <span class="gp">&gt;&gt;&gt; </span><span class="n">street</span><span class="o">.</span><span class="n">casefold</span><span class="p">()</span>
  560. <span class="go">&#39;gürzenichstrasse&#39;</span>
  561. </pre></div>
  562. </div>
  563. <p>A second tool is the <a class="reference internal" href="../library/unicodedata.html#module-unicodedata" title="unicodedata: Access the Unicode Database."><code class="xref py py-mod docutils literal notranslate"><span class="pre">unicodedata</span></code></a> module’s
  564. <a class="reference internal" href="../library/unicodedata.html#unicodedata.normalize" title="unicodedata.normalize"><code class="xref py py-func docutils literal notranslate"><span class="pre">normalize()</span></code></a> function that converts strings to one
  565. of several normal forms, each of which represents equivalent sequences (such as a
  566. precomposed letter versus a letter followed by a combining character) in one consistent way. <a class="reference internal" href="../library/unicodedata.html#unicodedata.normalize" title="unicodedata.normalize"><code class="xref py py-func docutils literal notranslate"><span class="pre">normalize()</span></code></a> can
  567. be used to perform string comparisons that won’t falsely report
  568. inequality if two strings use combining characters differently:</p>
  569. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">unicodedata</span>
  570. <span class="k">def</span> <span class="nf">compare_strs</span><span class="p">(</span><span class="n">s1</span><span class="p">,</span> <span class="n">s2</span><span class="p">):</span>
  571.     <span class="k">def</span> <span class="nf">NFD</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
  572.         <span class="k">return</span> <span class="n">unicodedata</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="s1">&#39;NFD&#39;</span><span class="p">,</span> <span class="n">s</span><span class="p">)</span>
  573.     <span class="k">return</span> <span class="n">NFD</span><span class="p">(</span><span class="n">s1</span><span class="p">)</span> <span class="o">==</span> <span class="n">NFD</span><span class="p">(</span><span class="n">s2</span><span class="p">)</span>
  574. <span class="n">single_char</span> <span class="o">=</span> <span class="s1">&#39;ê&#39;</span>
  575. <span class="n">multiple_chars</span> <span class="o">=</span> <span class="s1">&#39;</span><span class="se">\N{LATIN SMALL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}</span><span class="s1">&#39;</span>
  576. <span class="nb">print</span><span class="p">(</span><span class="s1">&#39;length of first string=&#39;</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">single_char</span><span class="p">))</span>
  577. <span class="nb">print</span><span class="p">(</span><span class="s1">&#39;length of second string=&#39;</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">multiple_chars</span><span class="p">))</span>
  578. <span class="nb">print</span><span class="p">(</span><span class="n">compare_strs</span><span class="p">(</span><span class="n">single_char</span><span class="p">,</span> <span class="n">multiple_chars</span><span class="p">))</span>
  579. </pre></div>
  580. </div>
  581. <p>When run, this outputs:</p>
  582. <div class="highlight-shell-session notranslate"><div class="highlight"><pre><span></span><span class="gp">$ </span>python<span class="w"> </span>compare-strs.py
  583. <span class="go">length of first string= 1</span>
  584. <span class="go">length of second string= 2</span>
  585. <span class="go">True</span>
  586. </pre></div>
  587. </div>
  588. <p>The first argument to the <a class="reference internal" href="../library/unicodedata.html#unicodedata.normalize" title="unicodedata.normalize"><code class="xref py py-func docutils literal notranslate"><span class="pre">normalize()</span></code></a> function is a
  589. string giving the desired normalization form, which can be one of
  590. ‘NFC’, ‘NFKC’, ‘NFD’, and ‘NFKD’.</p>
  591. <p>The Unicode Standard also specifies how to do caseless comparisons:</p>
  592. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">unicodedata</span>
  593. <span class="k">def</span> <span class="nf">compare_caseless</span><span class="p">(</span><span class="n">s1</span><span class="p">,</span> <span class="n">s2</span><span class="p">):</span>
  594.     <span class="k">def</span> <span class="nf">NFD</span><span class="p">(</span><span class="n">s</span><span class="p">):</span>
  595.         <span class="k">return</span> <span class="n">unicodedata</span><span class="o">.</span><span class="n">normalize</span><span class="p">(</span><span class="s1">&#39;NFD&#39;</span><span class="p">,</span> <span class="n">s</span><span class="p">)</span>
  596.     <span class="k">return</span> <span class="n">NFD</span><span class="p">(</span><span class="n">NFD</span><span class="p">(</span><span class="n">s1</span><span class="p">)</span><span class="o">.</span><span class="n">casefold</span><span class="p">())</span> <span class="o">==</span> <span class="n">NFD</span><span class="p">(</span><span class="n">NFD</span><span class="p">(</span><span class="n">s2</span><span class="p">)</span><span class="o">.</span><span class="n">casefold</span><span class="p">())</span>
  597. <span class="c1"># Example usage</span>
  598. <span class="n">single_char</span> <span class="o">=</span> <span class="s1">&#39;ê&#39;</span>
  599. <span class="n">multiple_chars</span> <span class="o">=</span> <span class="s1">&#39;</span><span class="se">\N{LATIN CAPITAL LETTER E}\N{COMBINING CIRCUMFLEX ACCENT}</span><span class="s1">&#39;</span>
  600. <span class="nb">print</span><span class="p">(</span><span class="n">compare_caseless</span><span class="p">(</span><span class="n">single_char</span><span class="p">,</span> <span class="n">multiple_chars</span><span class="p">))</span>
  601. </pre></div>
  602. </div>
  603. <p>This will print <code class="docutils literal notranslate"><span class="pre">True</span></code>. (Why is <code class="xref py py-func docutils literal notranslate"><span class="pre">NFD()</span></code> invoked twice? Because
  604. there are a few characters that make <a class="reference internal" href="../library/stdtypes.html#str.casefold" title="str.casefold"><code class="xref py py-meth docutils literal notranslate"><span class="pre">casefold()</span></code></a> return a
  605. non-normalized string, so the result needs to be normalized again. See
  606. section 3.13 of the Unicode Standard for a discussion and an example.)</p>
  607. </section>
  608. <section id="unicode-regular-expressions">
  609. <h3>Unicode Regular Expressions<a class="headerlink" href="#unicode-regular-expressions" title="Permalink to this headline">¶</a></h3>
  610. <p>The regular expressions supported by the <a class="reference internal" href="../library/re.html#module-re" title="re: Regular expression operations."><code class="xref py py-mod docutils literal notranslate"><span class="pre">re</span></code></a> module can be provided
  611. either as bytes or strings. Some of the special character sequences such as
  612. <code class="docutils literal notranslate"><span class="pre">\d</span></code> and <code class="docutils literal notranslate"><span class="pre">\w</span></code> have different meanings depending on whether
  613. the pattern is supplied as bytes or a string. For example,
  614. <code class="docutils literal notranslate"><span class="pre">\d</span></code> will match the characters <code class="docutils literal notranslate"><span class="pre">[0-9]</span></code> in bytes but
  615. in strings will match any character that’s in the <code class="docutils literal notranslate"><span class="pre">'Nd'</span></code> category.</p>
  616. <p>The string in this example has the number 57 written in both Thai and
  617. Arabic numerals:</p>
  618. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">re</span>
  619. <span class="n">p</span> <span class="o">=</span> <span class="n">re</span><span class="o">.</span><span class="n">compile</span><span class="p">(</span><span class="sa">r</span><span class="s1">&#39;\d+&#39;</span><span class="p">)</span>
  620. <span class="n">s</span> <span class="o">=</span> <span class="s2">&quot;Over </span><span class="se">\u0e55\u0e57</span><span class="s2"> 57 flavours&quot;</span>
  621. <span class="n">m</span> <span class="o">=</span> <span class="n">p</span><span class="o">.</span><span class="n">search</span><span class="p">(</span><span class="n">s</span><span class="p">)</span>
  622. <span class="nb">print</span><span class="p">(</span><span class="nb">repr</span><span class="p">(</span><span class="n">m</span><span class="o">.</span><span class="n">group</span><span class="p">()))</span>
  623. </pre></div>
  624. </div>
  625. <p>When executed, <code class="docutils literal notranslate"><span class="pre">\d+</span></code> will match the Thai numerals and print them
  626. out. If you supply the <a class="reference internal" href="../library/re.html#re.ASCII" title="re.ASCII"><code class="xref py py-const docutils literal notranslate"><span class="pre">re.ASCII</span></code></a> flag to
  627. <a class="reference internal" href="../library/re.html#re.compile" title="re.compile"><code class="xref py py-func docutils literal notranslate"><span class="pre">compile()</span></code></a>, <code class="docutils literal notranslate"><span class="pre">\d+</span></code> will match the substring “57” instead.</p>
  628. <p>Similarly, <code class="docutils literal notranslate"><span class="pre">\w</span></code> matches a wide variety of Unicode characters but
  629. only <code class="docutils literal notranslate"><span class="pre">[a-zA-Z0-9_]</span></code> in bytes or if <a class="reference internal" href="../library/re.html#re.ASCII" title="re.ASCII"><code class="xref py py-const docutils literal notranslate"><span class="pre">re.ASCII</span></code></a> is supplied,
  630. and <code class="docutils literal notranslate"><span class="pre">\s</span></code> will match either Unicode whitespace characters or
  631. <code class="docutils literal notranslate"><span class="pre">[</span> <span class="pre">\t\n\r\f\v]</span></code>.</p>
  632. </section>
  633. <section id="id2">
  634. <h3>References<a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h3>
  635. <p>Some good alternative discussions of Python’s Unicode support are:</p>
  636. <ul class="simple">
  637. <li><p><a class="reference external" href="https://python-notes.curiousefficiency.org/en/latest/python3/text_file_processing.html">Processing Text Files in Python 3</a>, by Nick Coghlan.</p></li>
  638. <li><p><a class="reference external" href="https://nedbatchelder.com/text/unipain.html">Pragmatic Unicode</a>, a PyCon 2012 presentation by Ned Batchelder.</p></li>
  639. </ul>
  640. <p>The <a class="reference internal" href="../library/stdtypes.html#str" title="str"><code class="xref py py-class docutils literal notranslate"><span class="pre">str</span></code></a> type is described in the Python library reference at
  641. <a class="reference internal" href="../library/stdtypes.html#textseq"><span class="std std-ref">Text Sequence Type — str</span></a>.</p>
  642. <p>The documentation for the <a class="reference internal" href="../library/unicodedata.html#module-unicodedata" title="unicodedata: Access the Unicode Database."><code class="xref py py-mod docutils literal notranslate"><span class="pre">unicodedata</span></code></a> module.</p>
  643. <p>The documentation for the <a class="reference internal" href="../library/codecs.html#module-codecs" title="codecs: Encode and decode data and streams."><code class="xref py py-mod docutils literal notranslate"><span class="pre">codecs</span></code></a> module.</p>
  644. <p>Marc-André Lemburg gave <a class="reference external" href="https://downloads.egenix.com/python/Unicode-EPC2002-Talk.pdf">a presentation titled “Python and Unicode” (PDF slides)</a> at
  645. EuroPython 2002. The slides are an excellent overview of the design of Python
  646. 2’s Unicode features (where the Unicode string type is called <code class="docutils literal notranslate"><span class="pre">unicode</span></code> and
  647. literals start with <code class="docutils literal notranslate"><span class="pre">u</span></code>).</p>
  648. </section>
  649. </section>
  650. <section id="reading-and-writing-unicode-data">
  651. <h2>Reading and Writing Unicode Data<a class="headerlink" href="#reading-and-writing-unicode-data" title="Permalink to this headline">¶</a></h2>
  652. <p>Once you’ve written some code that works with Unicode data, the next problem is
  653. input/output. How do you get Unicode strings into your program, and how do you
  654. convert Unicode into a form suitable for storage or transmission?</p>
  655. <p>It’s possible that you may not need to do anything depending on your input
  656. sources and output destinations; you should check whether the libraries used in
  657. your application support Unicode natively. XML parsers often return Unicode
  658. data, for example. Many relational databases also support Unicode-valued
  659. columns and can return Unicode values from an SQL query.</p>
  660. <p>Unicode data is usually converted to a particular encoding before it gets
  661. written to disk or sent over a socket. It’s possible to do all the work
  662. yourself: open a file, read an 8-bit bytes object from it, and convert the bytes
  663. with <code class="docutils literal notranslate"><span class="pre">bytes.decode(encoding)</span></code>. However, the manual approach is not recommended.</p>
  664. <p>One problem is the multi-byte nature of encodings; one Unicode character can be
  665. represented by several bytes. If you want to read the file in arbitrary-sized
  666. chunks (say, 1024 or 4096 bytes), you need to write error-handling code to catch the case
  667. where only part of the bytes encoding a single Unicode character are read at the
  668. end of a chunk. One solution would be to read the entire file into memory and
  669. then perform the decoding, but that prevents you from working with files that
  670. are extremely large; if you need to read a 2 GiB file, you need 2 GiB of RAM.
  671. (More, really, since for at least a moment you’d need to have both the encoded
  672. string and its Unicode version in memory.)</p>
  673. <p>The solution would be to use the low-level decoding interface to catch the case
  674. of partial coding sequences. The work of implementing this has already been
  675. done for you: the built-in <a class="reference internal" href="../library/functions.html#open" title="open"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a> function can return a file-like object
  676. that assumes the file’s contents are in a specified encoding and accepts Unicode
  677. parameters for methods such as <a class="reference internal" href="../library/io.html#io.TextIOBase.read" title="io.TextIOBase.read"><code class="xref py py-meth docutils literal notranslate"><span class="pre">read()</span></code></a> and
  678. <a class="reference internal" href="../library/io.html#io.TextIOBase.write" title="io.TextIOBase.write"><code class="xref py py-meth docutils literal notranslate"><span class="pre">write()</span></code></a>. This works through <a class="reference internal" href="../library/functions.html#open" title="open"><code class="xref py py-func docutils literal notranslate"><span class="pre">open()</span></code></a>'s <em>encoding</em> and
  679. <em>errors</em> parameters which are interpreted just like those in <a class="reference internal" href="../library/stdtypes.html#str.encode" title="str.encode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">str.encode()</span></code></a>
  680. and <a class="reference internal" href="../library/stdtypes.html#bytes.decode" title="bytes.decode"><code class="xref py py-meth docutils literal notranslate"><span class="pre">bytes.decode()</span></code></a>.</p>
  681. <p>Reading Unicode from a file is therefore simple:</p>
  682. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">&#39;unicode.txt&#39;</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s1">&#39;utf-8&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
  683.     <span class="k">for</span> <span class="n">line</span> <span class="ow">in</span> <span class="n">f</span><span class="p">:</span>
  684.         <span class="nb">print</span><span class="p">(</span><span class="nb">repr</span><span class="p">(</span><span class="n">line</span><span class="p">))</span>
  685. </pre></div>
  686. </div>
  687. <p>It’s also possible to open files in update mode, allowing both reading and
  688. writing:</p>
  689. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">&#39;test&#39;</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s1">&#39;utf-8&#39;</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;w+&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
  690.     <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">&#39;</span><span class="se">\u4500</span><span class="s1"> blah blah blah</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">)</span>
  691.     <span class="n">f</span><span class="o">.</span><span class="n">seek</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
  692.     <span class="nb">print</span><span class="p">(</span><span class="nb">repr</span><span class="p">(</span><span class="n">f</span><span class="o">.</span><span class="n">readline</span><span class="p">()[:</span><span class="mi">1</span><span class="p">]))</span>
  693. </pre></div>
  694. </div>
  695. <p>The Unicode character <code class="docutils literal notranslate"><span class="pre">U+FEFF</span></code> is used as a byte-order mark (BOM), and is often
  696. written as the first character of a file in order to assist with autodetection
  697. of the file’s byte ordering. Some encodings, such as UTF-16, expect a BOM to be
  698. present at the start of a file; when such an encoding is used, the BOM will be
  699. automatically written as the first character and will be silently dropped when
  700. the file is read. There are variants of these encodings, such as ‘utf-16-le’
  701. and ‘utf-16-be’ for little-endian and big-endian encodings, that specify one
  702. particular byte ordering and don’t skip the BOM.</p>
  703. <p>In some areas, it is also convention to use a “BOM” at the start of UTF-8
  704. encoded files; the name is misleading since UTF-8 is not byte-order dependent.
  705. The mark simply announces that the file is encoded in UTF-8. For reading such
  706. files, use the ‘utf-8-sig’ codec to automatically skip the mark if present.</p>
  707. <section id="unicode-filenames">
  708. <h3>Unicode filenames<a class="headerlink" href="#unicode-filenames" title="Permalink to this headline">¶</a></h3>
  709. <p>Most of the operating systems in common use today support filenames
  710. that contain arbitrary Unicode characters. Usually this is
  711. implemented by converting the Unicode string into some encoding that
  712. varies depending on the system. Today Python is converging on using
  713. UTF-8: Python on MacOS has used UTF-8 for several versions, and Python
  714. 3.6 switched to using UTF-8 on Windows as well. On Unix systems,
  715. there will only be a <a class="reference internal" href="../glossary.html#term-filesystem-encoding-and-error-handler"><span class="xref std std-term">filesystem encoding</span></a> if you’ve set the <code class="docutils literal notranslate"><span class="pre">LANG</span></code> or <code class="docutils literal notranslate"><span class="pre">LC_CTYPE</span></code> environment variables; if
  716. you haven’t, the default encoding is again UTF-8.</p>
  717. <p>The <a class="reference internal" href="../library/sys.html#sys.getfilesystemencoding" title="sys.getfilesystemencoding"><code class="xref py py-func docutils literal notranslate"><span class="pre">sys.getfilesystemencoding()</span></code></a> function returns the encoding to use on
  718. your current system, in case you want to do the encoding manually, but there’s
  719. not much reason to bother. When opening a file for reading or writing, you can
  720. usually just provide the Unicode string as the filename, and it will be
  721. automatically converted to the right encoding for you:</p>
  722. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="n">filename</span> <span class="o">=</span> <span class="s1">&#39;filename</span><span class="se">\u4500</span><span class="s1">abc&#39;</span>
  723. <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">filename</span><span class="p">,</span> <span class="s1">&#39;w&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
  724.     <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="s1">&#39;blah</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">)</span>
  725. </pre></div>
  726. </div>
  727. <p>Functions in the <a class="reference internal" href="../library/os.html#module-os" title="os: Miscellaneous operating system interfaces."><code class="xref py py-mod docutils literal notranslate"><span class="pre">os</span></code></a> module such as <a class="reference internal" href="../library/os.html#os.stat" title="os.stat"><code class="xref py py-func docutils literal notranslate"><span class="pre">os.stat()</span></code></a> will also accept Unicode
  728. filenames.</p>
  729. <p>The <a class="reference internal" href="../library/os.html#os.listdir" title="os.listdir"><code class="xref py py-func docutils literal notranslate"><span class="pre">os.listdir()</span></code></a> function returns filenames, which raises an issue: should it return
  730. the Unicode version of filenames, or should it return bytes containing
  731. the encoded versions? <a class="reference internal" href="../library/os.html#os.listdir" title="os.listdir"><code class="xref py py-func docutils literal notranslate"><span class="pre">os.listdir()</span></code></a> can do both, depending on whether you
  732. provided the directory path as bytes or a Unicode string. If you pass a
  733. Unicode string as the path, filenames will be decoded using the filesystem’s
  734. encoding and a list of Unicode strings will be returned, while passing a byte
  735. path will return the filenames as bytes. For example,
  736. assuming the default <a class="reference internal" href="../glossary.html#term-filesystem-encoding-and-error-handler"><span class="xref std std-term">filesystem encoding</span></a> is UTF-8, running the following program:</p>
  737. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="n">fn</span> <span class="o">=</span> <span class="s1">&#39;filename</span><span class="se">\u4500</span><span class="s1">abc&#39;</span>
  738. <span class="n">f</span> <span class="o">=</span> <span class="nb">open</span><span class="p">(</span><span class="n">fn</span><span class="p">,</span> <span class="s1">&#39;w&#39;</span><span class="p">)</span>
  739. <span class="n">f</span><span class="o">.</span><span class="n">close</span><span class="p">()</span>
  740. <span class="kn">import</span> <span class="nn">os</span>
  741. <span class="nb">print</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">listdir</span><span class="p">(</span><span class="sa">b</span><span class="s1">&#39;.&#39;</span><span class="p">))</span>
  742. <span class="nb">print</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">listdir</span><span class="p">(</span><span class="s1">&#39;.&#39;</span><span class="p">))</span>
  743. </pre></div>
  744. </div>
  745. <p>will produce the following output:</p>
  746. <div class="highlight-shell-session notranslate"><div class="highlight"><pre><span></span><span class="gp">$ </span>python<span class="w"> </span>listdir-test.py
  747. <span class="go">[b&#39;filename\xe4\x94\x80abc&#39;, ...]</span>
  748. <span class="go">[&#39;filename\u4500abc&#39;, ...]</span>
  749. </pre></div>
  750. </div>
  751. <p>The first list contains UTF-8-encoded filenames, and the second list contains
  752. the Unicode versions.</p>
  753. <p>Note that on most occasions, you can just stick with using
  754. Unicode with these APIs. The bytes APIs should only be used on
  755. systems where undecodable file names can be present; that’s
  756. pretty much only Unix systems now.</p>
  757. </section>
  758. <section id="tips-for-writing-unicode-aware-programs">
  759. <h3>Tips for Writing Unicode-aware Programs<a class="headerlink" href="#tips-for-writing-unicode-aware-programs" title="Permalink to this headline">¶</a></h3>
  760. <p>This section provides some suggestions on writing software that deals with
  761. Unicode.</p>
  762. <p>The most important tip is:</p>
  763. <blockquote>
  764. <div><p>Software should only work with Unicode strings internally, decoding the input
  765. data as soon as possible and encoding the output only at the end.</p>
  766. </div></blockquote>
  767. <p>If you attempt to write processing functions that accept both Unicode and byte
  768. strings, you will find your program vulnerable to bugs wherever you combine the
  769. two different kinds of strings. There is no automatic encoding or decoding: if
  770. you do e.g. <code class="docutils literal notranslate"><span class="pre">str</span> <span class="pre">+</span> <span class="pre">bytes</span></code>, a <a class="reference internal" href="../library/exceptions.html#TypeError" title="TypeError"><code class="xref py py-exc docutils literal notranslate"><span class="pre">TypeError</span></code></a> will be raised.</p>
  771. <p>When using data coming from a web browser or some other untrusted source, a
  772. common technique is to check for illegal characters in a string before using the
  773. string in a generated command line or storing it in a database. If you’re doing
  774. this, be careful to check the decoded string, not the encoded bytes data;
  775. some encodings may have interesting properties, such as not being bijective
  776. or not being fully ASCII-compatible. This is especially true if the input
  777. data also specifies the encoding, since the attacker can then choose a
  778. clever way to hide malicious text in the encoded bytestream.</p>
  779. <section id="converting-between-file-encodings">
  780. <h4>Converting Between File Encodings<a class="headerlink" href="#converting-between-file-encodings" title="Permalink to this headline">¶</a></h4>
  781. <p>The <a class="reference internal" href="../library/codecs.html#codecs.StreamRecoder" title="codecs.StreamRecoder"><code class="xref py py-class docutils literal notranslate"><span class="pre">StreamRecoder</span></code></a> class can transparently convert between
  782. encodings, taking a stream that returns data in encoding #1
  783. and behaving like a stream returning data in encoding #2.</p>
  784. <p>For example, if you have an input file <em>f</em> that’s in Latin-1, you
  785. can wrap it with a <a class="reference internal" href="../library/codecs.html#codecs.StreamRecoder" title="codecs.StreamRecoder"><code class="xref py py-class docutils literal notranslate"><span class="pre">StreamRecoder</span></code></a> to return bytes encoded in
  786. UTF-8:</p>
  787. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="n">new_f</span> <span class="o">=</span> <span class="n">codecs</span><span class="o">.</span><span class="n">StreamRecoder</span><span class="p">(</span><span class="n">f</span><span class="p">,</span>
  788.     <span class="c1"># en/decoder: used by read() to encode its results and</span>
  789.     <span class="c1"># by write() to decode its input.</span>
  790.     <span class="n">codecs</span><span class="o">.</span><span class="n">getencoder</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">),</span> <span class="n">codecs</span><span class="o">.</span><span class="n">getdecoder</span><span class="p">(</span><span class="s1">&#39;utf-8&#39;</span><span class="p">),</span>
  791.     <span class="c1"># reader/writer: used to read and write to the stream.</span>
  792.     <span class="n">codecs</span><span class="o">.</span><span class="n">getreader</span><span class="p">(</span><span class="s1">&#39;latin-1&#39;</span><span class="p">),</span> <span class="n">codecs</span><span class="o">.</span><span class="n">getwriter</span><span class="p">(</span><span class="s1">&#39;latin-1&#39;</span><span class="p">)</span> <span class="p">)</span>
  793. </pre></div>
  794. </div>
  795. </section>
  796. <section id="files-in-an-unknown-encoding">
  797. <h4>Files in an Unknown Encoding<a class="headerlink" href="#files-in-an-unknown-encoding" title="Permalink to this headline">¶</a></h4>
  798. <p>What can you do if you need to make a change to a file, but don’t know
  799. the file’s encoding? If you know the encoding is ASCII-compatible and
  800. only want to examine or modify the ASCII parts, you can open the file
  801. with the <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code> error handler:</p>
  802. <div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">fname</span><span class="p">,</span> <span class="s1">&#39;r&#39;</span><span class="p">,</span> <span class="n">encoding</span><span class="o">=</span><span class="s2">&quot;ascii&quot;</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s2">&quot;surrogateescape&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
  803.     <span class="n">data</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
  804. <span class="c1"># make changes to the string &#39;data&#39;</span>
  805. <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">fname</span> <span class="o">+</span> <span class="s1">&#39;.new&#39;</span><span class="p">,</span> <span class="s1">&#39;w&#39;</span><span class="p">,</span>
  806.           <span class="n">encoding</span><span class="o">=</span><span class="s2">&quot;ascii&quot;</span><span class="p">,</span> <span class="n">errors</span><span class="o">=</span><span class="s2">&quot;surrogateescape&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
  807.     <span class="n">f</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
  808. </pre></div>
  809. </div>
  810. <p>The <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code> error handler will decode any non-ASCII bytes
  811. as code points in a special range running from U+DC80 to
  812. U+DCFF. These code points will then turn back into the
  813. same bytes when the <code class="docutils literal notranslate"><span class="pre">surrogateescape</span></code> error handler is used to
  814. encode the data and write it back out.</p>
  815. </section>
  816. </section>
  817. <section id="id3">
  818. <h3>References<a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h3>
  819. <p>One section of <a class="reference external" href="https://pyvideo.org/video/289/pycon-2010--mastering-python-3-i-o">Mastering Python 3 Input/Output</a>,
  820. a PyCon 2010 talk by David Beazley, discusses text processing and binary data handling.</p>
  821. <p>The <a class="reference external" href="https://downloads.egenix.com/python/LSM2005-Developing-Unicode-aware-applications-in-Python.pdf">PDF slides for Marc-André Lemburg’s presentation “Writing Unicode-aware
  822. Applications in Python”</a>
  823. discuss questions of character encodings as well as how to internationalize
  824. and localize an application. These slides cover Python 2.x only.</p>
  825. <p><a class="reference external" href="https://pyvideo.org/video/1768/the-guts-of-unicode-in-python">The Guts of Unicode in Python</a>
  826. is a PyCon 2013 talk by Benjamin Peterson that discusses the internal Unicode
  827. representation in Python 3.3.</p>
  828. </section>
  829. </section>
  830. <section id="acknowledgements">
  831. <h2>Acknowledgements<a class="headerlink" href="#acknowledgements" title="Permalink to this headline">¶</a></h2>
  832. <p>The initial draft of this document was written by Andrew Kuchling.
  833. It has since been revised further by Alexander Belopolsky, Georg Brandl,
  834. Andrew Kuchling, and Ezio Melotti.</p>
  835. <p>Thanks to the following people who have noted errors or offered
  836. suggestions on this article: Éric Araujo, Nicholas Bastin, Nick
  837. Coghlan, Marius Gedminas, Kent Johnson, Ken Krugler, Marc-André
  838. Lemburg, Martin von Löwis, Terry J. Reedy, Serhiy Storchaka,
  839. Eryk Sun, Chad Whitacre, Graham Wideman.</p>
  840. </section>
  841. </section>
  842. <div class="clearer"></div>
  843. </div>
  844. </div>
  845. </div>
  846. <div class="sphinxsidebar" role="navigation" aria-label="main navigation">
  847. <div class="sphinxsidebarwrapper">
  848. <div>
  849. <h3><a href="../contents.html">Table of Contents</a></h3>
  850. <ul>
  851. <li><a class="reference internal" href="#">Unicode HOWTO</a><ul>
  852. <li><a class="reference internal" href="#introduction-to-unicode">Introduction to Unicode</a><ul>
  853. <li><a class="reference internal" href="#definitions">Definitions</a></li>
  854. <li><a class="reference internal" href="#encodings">Encodings</a></li>
  855. <li><a class="reference internal" href="#references">References</a></li>
  856. </ul>
  857. </li>
  858. <li><a class="reference internal" href="#python-s-unicode-support">Python’s Unicode Support</a><ul>
  859. <li><a class="reference internal" href="#the-string-type">The String Type</a></li>
  860. <li><a class="reference internal" href="#converting-to-bytes">Converting to Bytes</a></li>
  861. <li><a class="reference internal" href="#unicode-literals-in-python-source-code">Unicode Literals in Python Source Code</a></li>
  862. <li><a class="reference internal" href="#unicode-properties">Unicode Properties</a></li>
  863. <li><a class="reference internal" href="#comparing-strings">Comparing Strings</a></li>
  864. <li><a class="reference internal" href="#unicode-regular-expressions">Unicode Regular Expressions</a></li>
  865. <li><a class="reference internal" href="#id2">References</a></li>
  866. </ul>
  867. </li>
  868. <li><a class="reference internal" href="#reading-and-writing-unicode-data">Reading and Writing Unicode Data</a><ul>
  869. <li><a class="reference internal" href="#unicode-filenames">Unicode filenames</a></li>
  870. <li><a class="reference internal" href="#tips-for-writing-unicode-aware-programs">Tips for Writing Unicode-aware Programs</a><ul>
  871. <li><a class="reference internal" href="#converting-between-file-encodings">Converting Between File Encodings</a></li>
  872. <li><a class="reference internal" href="#files-in-an-unknown-encoding">Files in an Unknown Encoding</a></li>
  873. </ul>
  874. </li>
  875. <li><a class="reference internal" href="#id3">References</a></li>
  876. </ul>
  877. </li>
  878. <li><a class="reference internal" href="#acknowledgements">Acknowledgements</a></li>
  879. </ul>
  880. </li>
  881. </ul>
  882. </div>
  883. <div>
  884. <h4>Previous topic</h4>
  885. <p class="topless"><a href="sorting.html"
  886. title="previous chapter">Sorting HOW TO</a></p>
  887. </div>
  888. <div>
  889. <h4>Next topic</h4>
  890. <p class="topless"><a href="urllib2.html"
  891. title="next chapter">HOWTO Fetch Internet Resources Using The urllib Package</a></p>
  892. </div>
  893. <div role="note" aria-label="source link">
  894. <h3>This Page</h3>
  895. <ul class="this-page-menu">
  896. <li><a href="../bugs.html">Report a Bug</a></li>
  897. <li>
  898. <a href="https://github.com/python/cpython/blob/main/Doc/howto/unicode.rst"
  899. rel="nofollow">Show Source
  900. </a>
  901. </li>
  902. </ul>
  903. </div>
  904. </div>
  905. </div>
  906. <div class="clearer"></div>
  907. </div>
  908. <div class="related" role="navigation" aria-label="related navigation">
  909. <h3>Navigation</h3>
  910. <ul>
  911. <li class="right" style="margin-right: 10px">
  912. <a href="../genindex.html" title="General Index"
  913. >index</a></li>
  914. <li class="right" >
  915. <a href="../py-modindex.html" title="Python Module Index"
  916. >modules</a> |</li>
  917. <li class="right" >
  918. <a href="urllib2.html" title="HOWTO Fetch Internet Resources Using The urllib Package"
  919. >next</a> |</li>
  920. <li class="right" >
  921. <a href="sorting.html" title="Sorting HOW TO"
  922. >previous</a> |</li>
  923. <li><img src="../_static/py.svg" alt="python logo" style="vertical-align: middle; margin-top: -1px"/></li>
  924. <li><a href="https://www.python.org/">Python</a> &#187;</li>
  925. <li class="switchers">
  926. <div class="language_switcher_placeholder"></div>
  927. <div class="version_switcher_placeholder"></div>
  928. </li>
  929. <li>
  930. </li>
  931. <li id="cpython-language-and-version">
  932. <a href="../index.html">3.12.0 Documentation</a> &#187;
  933. </li>
  934. <li class="nav-item nav-item-1"><a href="index.html" >Python HOWTOs</a> &#187;</li>
  935. <li class="nav-item nav-item-this"><a href="">Unicode HOWTO</a></li>
  936. <li class="right">
  937. <div class="inline-search" role="search">
  938. <form class="inline-search" action="../search.html" method="get">
  939. <input placeholder="Quick search" aria-label="Quick search" type="search" name="q" />
  940. <input type="submit" value="Go" />
  941. </form>
  942. </div>
  943. |
  944. </li>
  945. <li class="right">
  946. <label class="theme-selector-label">
  947. Theme
  948. <select class="theme-selector" oninput="activateTheme(this.value)">
  949. <option value="auto" selected>Auto</option>
  950. <option value="light">Light</option>
  951. <option value="dark">Dark</option>
  952. </select>
  953. </label> |</li>
  954. </ul>
  955. </div>
  956. <div class="footer">
  957. &copy; <a href="../copyright.html">Copyright</a> 2001-2023, Python Software Foundation.
  958. <br />
  959. This page is licensed under the Python Software Foundation License Version 2.
  960. <br />
  961. Examples, recipes, and other code in the documentation are additionally licensed under the Zero Clause BSD License.
  962. <br />
  963. See <a href="/license.html">History and License</a> for more information.<br />
  964. <br />
  965. The Python Software Foundation is a non-profit corporation.
  966. <a href="https://www.python.org/psf/donations/">Please donate.</a>
  967. <br />
  968. <br />
  969. Last updated on Oct 02, 2023.
  970. <a href="/bugs.html">Found a bug</a>?
  971. <br />
  972. Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
  973. </div>
  974. </body>
  975. </html>