_parser.py 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613
  1. # -*- coding: utf-8 -*-
  2. """
  3. This module offers a generic date/time string parser which is able to parse
  4. most known formats to represent a date and/or time.
  5. This module attempts to be forgiving with regards to unlikely input formats,
  6. returning a datetime object even for dates which are ambiguous. If an element
  7. of a date/time stamp is omitted, the following rules are applied:
  8. - If AM or PM is left unspecified, a 24-hour clock is assumed, however, an hour
  9. on a 12-hour clock (``0 <= hour <= 12``) *must* be specified if AM or PM is
  10. specified.
  11. - If a time zone is omitted, a timezone-naive datetime is returned.
  12. If any other elements are missing, they are taken from the
  13. :class:`datetime.datetime` object passed to the parameter ``default``. If this
  14. results in a day number exceeding the valid number of days per month, the
  15. value falls back to the end of the month.
  16. Additional resources about date/time string formats can be found below:
  17. - `A summary of the international standard date and time notation
  18. <https://www.cl.cam.ac.uk/~mgk25/iso-time.html>`_
  19. - `W3C Date and Time Formats <https://www.w3.org/TR/NOTE-datetime>`_
  20. - `Time Formats (Planetary Rings Node) <https://pds-rings.seti.org:443/tools/time_formats.html>`_
  21. - `CPAN ParseDate module
  22. <https://metacpan.org/pod/release/MUIR/Time-modules-2013.0912/lib/Time/ParseDate.pm>`_
  23. - `Java SimpleDateFormat Class
  24. <https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html>`_
  25. """
  26. from __future__ import unicode_literals
  27. import datetime
  28. import re
  29. import string
  30. import time
  31. import warnings
  32. from calendar import monthrange
  33. from io import StringIO
  34. import six
  35. from six import integer_types, text_type
  36. from decimal import Decimal
  37. from warnings import warn
  38. from .. import relativedelta
  39. from .. import tz
# Public API of this module: the names exported by ``from ... import *``.
__all__ = ["parse", "parserinfo", "ParserError"]
  41. # TODO: pandas.core.tools.datetimes imports this explicitly. Might be worth
  42. # making public and/or figuring out if there is something we can
  43. # take off their plate.
  44. class _timelex(object):
  45. # Fractional seconds are sometimes split by a comma
  46. _split_decimal = re.compile("([.,])")
  47. def __init__(self, instream):
  48. if isinstance(instream, (bytes, bytearray)):
  49. instream = instream.decode()
  50. if isinstance(instream, text_type):
  51. instream = StringIO(instream)
  52. elif getattr(instream, 'read', None) is None:
  53. raise TypeError('Parser must be a string or character stream, not '
  54. '{itype}'.format(itype=instream.__class__.__name__))
  55. self.instream = instream
  56. self.charstack = []
  57. self.tokenstack = []
  58. self.eof = False
  59. def get_token(self):
  60. """
  61. This function breaks the time string into lexical units (tokens), which
  62. can be parsed by the parser. Lexical units are demarcated by changes in
  63. the character set, so any continuous string of letters is considered
  64. one unit, any continuous string of numbers is considered one unit.
  65. The main complication arises from the fact that dots ('.') can be used
  66. both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
  67. "4:30:21.447"). As such, it is necessary to read the full context of
  68. any dot-separated strings before breaking it into tokens; as such, this
  69. function maintains a "token stack", for when the ambiguous context
  70. demands that multiple tokens be parsed at once.
  71. """
  72. if self.tokenstack:
  73. return self.tokenstack.pop(0)
  74. seenletters = False
  75. token = None
  76. state = None
  77. while not self.eof:
  78. # We only realize that we've reached the end of a token when we
  79. # find a character that's not part of the current token - since
  80. # that character may be part of the next token, it's stored in the
  81. # charstack.
  82. if self.charstack:
  83. nextchar = self.charstack.pop(0)
  84. else:
  85. nextchar = self.instream.read(1)
  86. while nextchar == '\x00':
  87. nextchar = self.instream.read(1)
  88. if not nextchar:
  89. self.eof = True
  90. break
  91. elif not state:
  92. # First character of the token - determines if we're starting
  93. # to parse a word, a number or something else.
  94. token = nextchar
  95. if self.isword(nextchar):
  96. state = 'a'
  97. elif self.isnum(nextchar):
  98. state = '0'
  99. elif self.isspace(nextchar):
  100. token = ' '
  101. break # emit token
  102. else:
  103. break # emit token
  104. elif state == 'a':
  105. # If we've already started reading a word, we keep reading
  106. # letters until we find something that's not part of a word.
  107. seenletters = True
  108. if self.isword(nextchar):
  109. token += nextchar
  110. elif nextchar == '.':
  111. token += nextchar
  112. state = 'a.'
  113. else:
  114. self.charstack.append(nextchar)
  115. break # emit token
  116. elif state == '0':
  117. # If we've already started reading a number, we keep reading
  118. # numbers until we find something that doesn't fit.
  119. if self.isnum(nextchar):
  120. token += nextchar
  121. elif nextchar == '.' or (nextchar == ',' and len(token) >= 2):
  122. token += nextchar
  123. state = '0.'
  124. else:
  125. self.charstack.append(nextchar)
  126. break # emit token
  127. elif state == 'a.':
  128. # If we've seen some letters and a dot separator, continue
  129. # parsing, and the tokens will be broken up later.
  130. seenletters = True
  131. if nextchar == '.' or self.isword(nextchar):
  132. token += nextchar
  133. elif self.isnum(nextchar) and token[-1] == '.':
  134. token += nextchar
  135. state = '0.'
  136. else:
  137. self.charstack.append(nextchar)
  138. break # emit token
  139. elif state == '0.':
  140. # If we've seen at least one dot separator, keep going, we'll
  141. # break up the tokens later.
  142. if nextchar == '.' or self.isnum(nextchar):
  143. token += nextchar
  144. elif self.isword(nextchar) and token[-1] == '.':
  145. token += nextchar
  146. state = 'a.'
  147. else:
  148. self.charstack.append(nextchar)
  149. break # emit token
  150. if (state in ('a.', '0.') and (seenletters or token.count('.') > 1 or
  151. token[-1] in '.,')):
  152. l = self._split_decimal.split(token)
  153. token = l[0]
  154. for tok in l[1:]:
  155. if tok:
  156. self.tokenstack.append(tok)
  157. if state == '0.' and token.count('.') == 0:
  158. token = token.replace(',', '.')
  159. return token
  160. def __iter__(self):
  161. return self
  162. def __next__(self):
  163. token = self.get_token()
  164. if token is None:
  165. raise StopIteration
  166. return token
  167. def next(self):
  168. return self.__next__() # Python 2.x support
  169. @classmethod
  170. def split(cls, s):
  171. return list(cls(s))
  172. @classmethod
  173. def isword(cls, nextchar):
  174. """ Whether or not the next character is part of a word """
  175. return nextchar.isalpha()
  176. @classmethod
  177. def isnum(cls, nextchar):
  178. """ Whether the next character is part of a number """
  179. return nextchar.isdigit()
  180. @classmethod
  181. def isspace(cls, nextchar):
  182. """ Whether the next character is whitespace """
  183. return nextchar.isspace()
  184. class _resultbase(object):
  185. def __init__(self):
  186. for attr in self.__slots__:
  187. setattr(self, attr, None)
  188. def _repr(self, classname):
  189. l = []
  190. for attr in self.__slots__:
  191. value = getattr(self, attr)
  192. if value is not None:
  193. l.append("%s=%s" % (attr, repr(value)))
  194. return "%s(%s)" % (classname, ", ".join(l))
  195. def __len__(self):
  196. return (sum(getattr(self, attr) is not None
  197. for attr in self.__slots__))
  198. def __repr__(self):
  199. return self._repr(self.__class__.__name__)
  200. class parserinfo(object):
  201. """
  202. Class which handles what inputs are accepted. Subclass this to customize
  203. the language and acceptable values for each parameter.
  204. :param dayfirst:
  205. Whether to interpret the first value in an ambiguous 3-integer date
  206. (e.g. 01/05/09) as the day (``True``) or month (``False``). If
  207. ``yearfirst`` is set to ``True``, this distinguishes between YDM
  208. and YMD. Default is ``False``.
  209. :param yearfirst:
  210. Whether to interpret the first value in an ambiguous 3-integer date
  211. (e.g. 01/05/09) as the year. If ``True``, the first number is taken
  212. to be the year, otherwise the last number is taken to be the year.
  213. Default is ``False``.
  214. """
  215. # m from a.m/p.m, t from ISO T separator
  216. JUMP = [" ", ".", ",", ";", "-", "/", "'",
  217. "at", "on", "and", "ad", "m", "t", "of",
  218. "st", "nd", "rd", "th"]
  219. WEEKDAYS = [("Mon", "Monday"),
  220. ("Tue", "Tuesday"), # TODO: "Tues"
  221. ("Wed", "Wednesday"),
  222. ("Thu", "Thursday"), # TODO: "Thurs"
  223. ("Fri", "Friday"),
  224. ("Sat", "Saturday"),
  225. ("Sun", "Sunday")]
  226. MONTHS = [("Jan", "January"),
  227. ("Feb", "February"), # TODO: "Febr"
  228. ("Mar", "March"),
  229. ("Apr", "April"),
  230. ("May", "May"),
  231. ("Jun", "June"),
  232. ("Jul", "July"),
  233. ("Aug", "August"),
  234. ("Sep", "Sept", "September"),
  235. ("Oct", "October"),
  236. ("Nov", "November"),
  237. ("Dec", "December")]
  238. HMS = [("h", "hour", "hours"),
  239. ("m", "minute", "minutes"),
  240. ("s", "second", "seconds")]
  241. AMPM = [("am", "a"),
  242. ("pm", "p")]
  243. UTCZONE = ["UTC", "GMT", "Z", "z"]
  244. PERTAIN = ["of"]
  245. TZOFFSET = {}
  246. # TODO: ERA = ["AD", "BC", "CE", "BCE", "Stardate",
  247. # "Anno Domini", "Year of Our Lord"]
  248. def __init__(self, dayfirst=False, yearfirst=False):
  249. self._jump = self._convert(self.JUMP)
  250. self._weekdays = self._convert(self.WEEKDAYS)
  251. self._months = self._convert(self.MONTHS)
  252. self._hms = self._convert(self.HMS)
  253. self._ampm = self._convert(self.AMPM)
  254. self._utczone = self._convert(self.UTCZONE)
  255. self._pertain = self._convert(self.PERTAIN)
  256. self.dayfirst = dayfirst
  257. self.yearfirst = yearfirst
  258. self._year = time.localtime().tm_year
  259. self._century = self._year // 100 * 100
  260. def _convert(self, lst):
  261. dct = {}
  262. for i, v in enumerate(lst):
  263. if isinstance(v, tuple):
  264. for v in v:
  265. dct[v.lower()] = i
  266. else:
  267. dct[v.lower()] = i
  268. return dct
  269. def jump(self, name):
  270. return name.lower() in self._jump
  271. def weekday(self, name):
  272. try:
  273. return self._weekdays[name.lower()]
  274. except KeyError:
  275. pass
  276. return None
  277. def month(self, name):
  278. try:
  279. return self._months[name.lower()] + 1
  280. except KeyError:
  281. pass
  282. return None
  283. def hms(self, name):
  284. try:
  285. return self._hms[name.lower()]
  286. except KeyError:
  287. return None
  288. def ampm(self, name):
  289. try:
  290. return self._ampm[name.lower()]
  291. except KeyError:
  292. return None
  293. def pertain(self, name):
  294. return name.lower() in self._pertain
  295. def utczone(self, name):
  296. return name.lower() in self._utczone
  297. def tzoffset(self, name):
  298. if name in self._utczone:
  299. return 0
  300. return self.TZOFFSET.get(name)
  301. def convertyear(self, year, century_specified=False):
  302. """
  303. Converts two-digit years to year within [-50, 49]
  304. range of self._year (current local time)
  305. """
  306. # Function contract is that the year is always positive
  307. assert year >= 0
  308. if year < 100 and not century_specified:
  309. # assume current century to start
  310. year += self._century
  311. if year >= self._year + 50: # if too far in future
  312. year -= 100
  313. elif year < self._year - 50: # if too far in past
  314. year += 100
  315. return year
  316. def validate(self, res):
  317. # move to info
  318. if res.year is not None:
  319. res.year = self.convertyear(res.year, res.century_specified)
  320. if ((res.tzoffset == 0 and not res.tzname) or
  321. (res.tzname == 'Z' or res.tzname == 'z')):
  322. res.tzname = "UTC"
  323. res.tzoffset = 0
  324. elif res.tzoffset != 0 and res.tzname and self.utczone(res.tzname):
  325. res.tzoffset = 0
  326. return True
  327. class _ymd(list):
  328. def __init__(self, *args, **kwargs):
  329. super(self.__class__, self).__init__(*args, **kwargs)
  330. self.century_specified = False
  331. self.dstridx = None
  332. self.mstridx = None
  333. self.ystridx = None
  334. @property
  335. def has_year(self):
  336. return self.ystridx is not None
  337. @property
  338. def has_month(self):
  339. return self.mstridx is not None
  340. @property
  341. def has_day(self):
  342. return self.dstridx is not None
  343. def could_be_day(self, value):
  344. if self.has_day:
  345. return False
  346. elif not self.has_month:
  347. return 1 <= value <= 31
  348. elif not self.has_year:
  349. # Be permissive, assume leap year
  350. month = self[self.mstridx]
  351. return 1 <= value <= monthrange(2000, month)[1]
  352. else:
  353. month = self[self.mstridx]
  354. year = self[self.ystridx]
  355. return 1 <= value <= monthrange(year, month)[1]
  356. def append(self, val, label=None):
  357. if hasattr(val, '__len__'):
  358. if val.isdigit() and len(val) > 2:
  359. self.century_specified = True
  360. if label not in [None, 'Y']: # pragma: no cover
  361. raise ValueError(label)
  362. label = 'Y'
  363. elif val > 100:
  364. self.century_specified = True
  365. if label not in [None, 'Y']: # pragma: no cover
  366. raise ValueError(label)
  367. label = 'Y'
  368. super(self.__class__, self).append(int(val))
  369. if label == 'M':
  370. if self.has_month:
  371. raise ValueError('Month is already set')
  372. self.mstridx = len(self) - 1
  373. elif label == 'D':
  374. if self.has_day:
  375. raise ValueError('Day is already set')
  376. self.dstridx = len(self) - 1
  377. elif label == 'Y':
  378. if self.has_year:
  379. raise ValueError('Year is already set')
  380. self.ystridx = len(self) - 1
  381. def _resolve_from_stridxs(self, strids):
  382. """
  383. Try to resolve the identities of year/month/day elements using
  384. ystridx, mstridx, and dstridx, if enough of these are specified.
  385. """
  386. if len(self) == 3 and len(strids) == 2:
  387. # we can back out the remaining stridx value
  388. missing = [x for x in range(3) if x not in strids.values()]
  389. key = [x for x in ['y', 'm', 'd'] if x not in strids]
  390. assert len(missing) == len(key) == 1
  391. key = key[0]
  392. val = missing[0]
  393. strids[key] = val
  394. assert len(self) == len(strids) # otherwise this should not be called
  395. out = {key: self[strids[key]] for key in strids}
  396. return (out.get('y'), out.get('m'), out.get('d'))
  397. def resolve_ymd(self, yearfirst, dayfirst):
  398. len_ymd = len(self)
  399. year, month, day = (None, None, None)
  400. strids = (('y', self.ystridx),
  401. ('m', self.mstridx),
  402. ('d', self.dstridx))
  403. strids = {key: val for key, val in strids if val is not None}
  404. if (len(self) == len(strids) > 0 or
  405. (len(self) == 3 and len(strids) == 2)):
  406. return self._resolve_from_stridxs(strids)
  407. mstridx = self.mstridx
  408. if len_ymd > 3:
  409. raise ValueError("More than three YMD values")
  410. elif len_ymd == 1 or (mstridx is not None and len_ymd == 2):
  411. # One member, or two members with a month string
  412. if mstridx is not None:
  413. month = self[mstridx]
  414. # since mstridx is 0 or 1, self[mstridx-1] always
  415. # looks up the other element
  416. other = self[mstridx - 1]
  417. else:
  418. other = self[0]
  419. if len_ymd > 1 or mstridx is None:
  420. if other > 31:
  421. year = other
  422. else:
  423. day = other
  424. elif len_ymd == 2:
  425. # Two members with numbers
  426. if self[0] > 31:
  427. # 99-01
  428. year, month = self
  429. elif self[1] > 31:
  430. # 01-99
  431. month, year = self
  432. elif dayfirst and self[1] <= 12:
  433. # 13-01
  434. day, month = self
  435. else:
  436. # 01-13
  437. month, day = self
  438. elif len_ymd == 3:
  439. # Three members
  440. if mstridx == 0:
  441. if self[1] > 31:
  442. # Apr-2003-25
  443. month, year, day = self
  444. else:
  445. month, day, year = self
  446. elif mstridx == 1:
  447. if self[0] > 31 or (yearfirst and self[2] <= 31):
  448. # 99-Jan-01
  449. year, month, day = self
  450. else:
  451. # 01-Jan-01
  452. # Give precedence to day-first, since
  453. # two-digit years is usually hand-written.
  454. day, month, year = self
  455. elif mstridx == 2:
  456. # WTF!?
  457. if self[1] > 31:
  458. # 01-99-Jan
  459. day, year, month = self
  460. else:
  461. # 99-01-Jan
  462. year, day, month = self
  463. else:
  464. if (self[0] > 31 or
  465. self.ystridx == 0 or
  466. (yearfirst and self[1] <= 12 and self[2] <= 31)):
  467. # 99-01-01
  468. if dayfirst and self[2] <= 12:
  469. year, day, month = self
  470. else:
  471. year, month, day = self
  472. elif self[0] > 12 or (dayfirst and self[1] <= 12):
  473. # 13-01-01
  474. day, month, year = self
  475. else:
  476. # 01-13-01
  477. month, day, year = self
  478. return year, month, day
  479. class parser(object):
  480. def __init__(self, info=None):
  481. self.info = info or parserinfo()
  482. def parse(self, timestr, default=None,
  483. ignoretz=False, tzinfos=None, **kwargs):
  484. """
  485. Parse the date/time string into a :class:`datetime.datetime` object.
  486. :param timestr:
  487. Any date/time string using the supported formats.
  488. :param default:
  489. The default datetime object, if this is a datetime object and not
  490. ``None``, elements specified in ``timestr`` replace elements in the
  491. default object.
  492. :param ignoretz:
  493. If set ``True``, time zones in parsed strings are ignored and a
  494. naive :class:`datetime.datetime` object is returned.
  495. :param tzinfos:
  496. Additional time zone names / aliases which may be present in the
  497. string. This argument maps time zone names (and optionally offsets
  498. from those time zones) to time zones. This parameter can be a
  499. dictionary with timezone aliases mapping time zone names to time
  500. zones or a function taking two parameters (``tzname`` and
  501. ``tzoffset``) and returning a time zone.
  502. The timezones to which the names are mapped can be an integer
  503. offset from UTC in seconds or a :class:`tzinfo` object.
  504. .. doctest::
  505. :options: +NORMALIZE_WHITESPACE
  506. >>> from dateutil.parser import parse
  507. >>> from dateutil.tz import gettz
  508. >>> tzinfos = {"BRST": -7200, "CST": gettz("America/Chicago")}
  509. >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos)
  510. datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -7200))
  511. >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos)
  512. datetime.datetime(2012, 1, 19, 17, 21,
  513. tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago'))
  514. This parameter is ignored if ``ignoretz`` is set.
  515. :param \\*\\*kwargs:
  516. Keyword arguments as passed to ``_parse()``.
  517. :return:
  518. Returns a :class:`datetime.datetime` object or, if the
  519. ``fuzzy_with_tokens`` option is ``True``, returns a tuple, the
  520. first element being a :class:`datetime.datetime` object, the second
  521. a tuple containing the fuzzy tokens.
  522. :raises ParserError:
  523. Raised for invalid or unknown string format, if the provided
  524. :class:`tzinfo` is not in a valid format, or if an invalid date
  525. would be created.
  526. :raises TypeError:
  527. Raised for non-string or character stream input.
  528. :raises OverflowError:
  529. Raised if the parsed date exceeds the largest valid C integer on
  530. your system.
  531. """
  532. if default is None:
  533. default = datetime.datetime.now().replace(hour=0, minute=0,
  534. second=0, microsecond=0)
  535. res, skipped_tokens = self._parse(timestr, **kwargs)
  536. if res is None:
  537. raise ParserError("Unknown string format: %s", timestr)
  538. if len(res) == 0:
  539. raise ParserError("String does not contain a date: %s", timestr)
  540. try:
  541. ret = self._build_naive(res, default)
  542. except ValueError as e:
  543. six.raise_from(ParserError(str(e) + ": %s", timestr), e)
  544. if not ignoretz:
  545. ret = self._build_tzaware(ret, res, tzinfos)
  546. if kwargs.get('fuzzy_with_tokens', False):
  547. return ret, skipped_tokens
  548. else:
  549. return ret
    class _result(_resultbase):
        # Fields of a parse result; ``_resultbase.__init__`` sets each of
        # them to ``None`` and ``len()``/``repr()`` only look at these names.
        # NOTE(review): ``_parse()`` also assigns ``res.century_specified``,
        # which is not listed here and is therefore not counted by ``len()``.
        __slots__ = ["year", "month", "day", "weekday",
                     "hour", "minute", "second", "microsecond",
                     "tzname", "tzoffset", "ampm","any_unused_tokens"]
    def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False,
               fuzzy_with_tokens=False):
        """
        Private method which performs the heavy lifting of parsing, called from
        ``parse()``, which passes on its ``kwargs`` to this function.

        :param timestr:
            The string to parse.

        :param dayfirst:
            Whether to interpret the first value in an ambiguous 3-integer date
            (e.g. 01/05/09) as the day (``True``) or month (``False``). If
            ``yearfirst`` is set to ``True``, this distinguishes between YDM
            and YMD. If set to ``None``, this value is retrieved from the
            current :class:`parserinfo` object (which itself defaults to
            ``False``).

        :param yearfirst:
            Whether to interpret the first value in an ambiguous 3-integer date
            (e.g. 01/05/09) as the year. If ``True``, the first number is taken
            to be the year, otherwise the last number is taken to be the year.
            If this is set to ``None``, the value is retrieved from the current
            :class:`parserinfo` object (which itself defaults to ``False``).

        :param fuzzy:
            Whether to allow fuzzy parsing, allowing for string like "Today is
            January 1, 2047 at 8:21:00AM".

        :param fuzzy_with_tokens:
            If ``True``, ``fuzzy`` is automatically set to True, and the parser
            will return a tuple where the first element is the parsed
            :class:`datetime.datetime` timestamp and the second element is
            a tuple containing the portions of the string which were ignored:

            .. doctest::

                >>> from dateutil.parser import parse
                >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)
                (datetime.datetime(2047, 1, 1, 8, 21), (u'Today is ', u' ', u'at '))

        :return:
            A ``(res, skipped_tokens)`` pair. ``res`` is a ``_result``
            instance and ``skipped_tokens`` is a tuple when
            ``fuzzy_with_tokens`` is set, otherwise ``None``. ``(None, None)``
            is returned when an ``IndexError`` or ``ValueError`` occurs while
            consuming the tokens or when ``info.validate()`` is falsy.
        """
        if fuzzy_with_tokens:
            fuzzy = True

        info = self.info

        if dayfirst is None:
            dayfirst = info.dayfirst

        if yearfirst is None:
            yearfirst = info.yearfirst

        res = self._result()
        l = _timelex.split(timestr)         # Splits the timestr into tokens

        # Indices of tokens that were skipped (jump words / fuzzy leftovers)
        skipped_idxs = []

        # year/month/day list
        ymd = _ymd()

        len_l = len(l)
        i = 0
        # Any IndexError (look-ahead past the last token) or ValueError
        # raised below is turned into a (None, None) result.
        try:
            while i < len_l:

                # Check if it's a number
                value_repr = l[i]
                try:
                    value = float(value_repr)
                except ValueError:
                    value = None

                if value is not None:
                    # Numeric token
                    # The helper returns the index of the last token it used.
                    i = self._parse_numeric_token(l, i, info, ymd, res, fuzzy)

                # Check weekday
                elif info.weekday(l[i]) is not None:
                    value = info.weekday(l[i])
                    res.weekday = value

                # Check month name
                elif info.month(l[i]) is not None:
                    value = info.month(l[i])
                    ymd.append(value, 'M')

                    if i + 1 < len_l:
                        if l[i + 1] in ('-', '/'):
                            # Jan-01[-99]
                            sep = l[i + 1]
                            ymd.append(l[i + 2])

                            if i + 3 < len_l and l[i + 3] == sep:
                                # Jan-01-99
                                ymd.append(l[i + 4])
                                i += 2

                            i += 2

                        elif (i + 4 < len_l and l[i + 1] == l[i + 3] == ' ' and
                              info.pertain(l[i + 2])):
                            # Jan of 01
                            # In this case, 01 is clearly year
                            if l[i + 4].isdigit():
                                # Convert it here to become unambiguous
                                value = int(l[i + 4])
                                year = str(info.convertyear(value))
                                ymd.append(year, 'Y')
                            else:
                                # Wrong guess
                                pass
                                # TODO: not hit in tests
                            i += 4

                # Check am/pm
                elif info.ampm(l[i]) is not None:
                    value = info.ampm(l[i])
                    val_is_ampm = self._ampm_valid(res.hour, res.ampm, fuzzy)

                    if val_is_ampm:
                        res.hour = self._adjust_ampm(res.hour, value)
                        res.ampm = value

                    elif fuzzy:
                        skipped_idxs.append(i)

                # Check for a timezone name
                elif self._could_be_tzname(res.hour, res.tzname, res.tzoffset, l[i]):
                    res.tzname = l[i]
                    res.tzoffset = info.tzoffset(res.tzname)

                    # Check for something like GMT+3, or BRST+3. Notice
                    # that it doesn't mean "I am 3 hours after GMT", but
                    # "my time +3 is GMT". If found, we reverse the
                    # logic so that timezone parsing code will get it
                    # right.
                    if i + 1 < len_l and l[i + 1] in ('+', '-'):
                        # Indexing the tuple with a bool swaps the sign
                        # token in place: '+' becomes '-' and vice versa.
                        l[i + 1] = ('+', '-')[l[i + 1] == '+']
                        res.tzoffset = None
                        if info.utczone(res.tzname):
                            # With something like GMT+3, the timezone
                            # is *not* GMT.
                            res.tzname = None

                # Check for a numbered timezone
                elif res.hour is not None and l[i] in ('+', '-'):
                    # +1 for '+', -1 for '-'
                    signal = (-1, 1)[l[i] == '+']
                    len_li = len(l[i + 1])

                    # TODO: check that l[i + 1] is integer?
                    if len_li == 4:
                        # -0300
                        hour_offset = int(l[i + 1][:2])
                        min_offset = int(l[i + 1][2:])
                    elif i + 2 < len_l and l[i + 2] == ':':
                        # -03:00
                        hour_offset = int(l[i + 1])
                        min_offset = int(l[i + 3])  # TODO: Check that l[i+3] is minute-like?
                        i += 2
                    elif len_li <= 2:
                        # -[0]3
                        hour_offset = int(l[i + 1][:2])
                        min_offset = 0
                    else:
                        raise ValueError(timestr)

                    # Offset is stored in seconds
                    res.tzoffset = signal * (hour_offset * 3600 + min_offset * 60)

                    # Look for a timezone name between parenthesis
                    if (i + 5 < len_l and
                            info.jump(l[i + 2]) and l[i + 3] == '(' and
                            l[i + 5] == ')' and
                            3 <= len(l[i + 4]) and
                            self._could_be_tzname(res.hour, res.tzname,
                                                  None, l[i + 4])):
                        # -0300 (BRST)
                        res.tzname = l[i + 4]
                        i += 4

                    # Step over the offset token that followed the sign
                    i += 1

                # Check jumps
                elif not (info.jump(l[i]) or fuzzy):
                    raise ValueError(timestr)

                else:
                    skipped_idxs.append(i)
                i += 1

            # Process year/month/day
            year, month, day = ymd.resolve_ymd(yearfirst, dayfirst)

            res.century_specified = ymd.century_specified
            res.year = year
            res.month = month
            res.day = day

        except (IndexError, ValueError):
            return None, None

        if not info.validate(res):
            return None, None

        if fuzzy_with_tokens:
            skipped_tokens = self._recombine_skipped(l, skipped_idxs)
            return res, tuple(skipped_tokens)
        else:
            return res, None
def _parse_numeric_token(self, tokens, idx, info, ymd, res, fuzzy):
    """
    Interpret the numeric token at ``tokens[idx]``.

    Depending on the token's length and on its neighbouring tokens, the
    value is recorded either as a date component (appended to ``ymd``) or
    as a time component (assigned to ``res.hour`` / ``res.minute`` /
    ``res.second`` / ``res.microsecond``).

    :param tokens:
        The list of string tokens being parsed.

    :param idx:
        Index of the numeric token within ``tokens``.

    :param info:
        Object queried through ``hms()``, ``jump()``, ``month()`` and
        ``ampm()`` to classify neighbouring tokens.

    :param ymd:
        Accumulator for year/month/day candidates; this method uses its
        ``len()``, ``append()`` and ``could_be_day()``.

    :param res:
        Result object whose time attributes are filled in.

    :param fuzzy:
        If true, a token that cannot be interpreted is silently ignored
        instead of raising.

    :return:
        ``idx``, advanced past any additional tokens this method consumed.
        NOTE(review): presumably the caller steps past the returned index
        itself -- confirm against ``_parse``.

    :raises ValueError:
        If the token cannot be converted to a decimal, if a non-numeric
        token after a date separator is not a month name, or if the token
        fits no known pattern and ``fuzzy`` is false.
    """
    # Token is a number
    value_repr = tokens[idx]
    try:
        value = self._to_decimal(value_repr)
    except Exception as e:
        six.raise_from(ValueError('Unknown numeric token'), e)

    # Length of the token text (not of the token list).
    len_li = len(value_repr)

    len_l = len(tokens)

    if (len(ymd) == 3 and len_li in (2, 4) and
        res.hour is None and
        (idx + 1 >= len_l or
         (tokens[idx + 1] != ':' and
          info.hms(tokens[idx + 1]) is None))):
        # 19990101T23[59]
        s = tokens[idx]
        res.hour = int(s[:2])

        if len_li == 4:
            res.minute = int(s[2:])

    elif len_li == 6 or (len_li > 6 and tokens[idx].find('.') == 6):
        # YYMMDD or HHMMSS[.ss]
        s = tokens[idx]

        if not ymd and '.' not in tokens[idx]:
            ymd.append(s[:2])
            ymd.append(s[2:4])
            ymd.append(s[4:])
        else:
            # 19990101T235959[.59]

            # TODO: Check if res attributes already set.
            res.hour = int(s[:2])
            res.minute = int(s[2:4])
            res.second, res.microsecond = self._parsems(s[4:])

    elif len_li in (8, 12, 14):
        # YYYYMMDD
        s = tokens[idx]
        ymd.append(s[:4], 'Y')
        ymd.append(s[4:6])
        ymd.append(s[6:8])

        if len_li > 8:
            res.hour = int(s[8:10])
            res.minute = int(s[10:12])

            if len_li > 12:
                res.second = int(s[12:])

    elif self._find_hms_idx(idx, tokens, info, allow_jump=True) is not None:
        # HH[ ]h or MM[ ]m or SS[.ss][ ]s
        hms_idx = self._find_hms_idx(idx, tokens, info, allow_jump=True)
        (idx, hms) = self._parse_hms(idx, tokens, info, hms_idx)
        if hms is not None:
            # TODO: checking that hour/minute/second are not
            # already set?
            self._assign_hms(res, value_repr, hms)

    elif idx + 2 < len_l and tokens[idx + 1] == ':':
        # HH:MM[:SS[.ss]]
        res.hour = int(value)
        value = self._to_decimal(tokens[idx + 2])  # TODO: try/except for this?
        (res.minute, res.second) = self._parse_min_sec(value)

        if idx + 4 < len_l and tokens[idx + 3] == ':':
            res.second, res.microsecond = self._parsems(tokens[idx + 4])

            # Skip the second ':' and the seconds token.
            idx += 2

        # Skip the first ':' and the minutes token.
        idx += 2

    elif idx + 1 < len_l and tokens[idx + 1] in ('-', '/', '.'):
        sep = tokens[idx + 1]
        ymd.append(value_repr)

        if idx + 2 < len_l and not info.jump(tokens[idx + 2]):
            if tokens[idx + 2].isdigit():
                # 01-01[-01]
                ymd.append(tokens[idx + 2])
            else:
                # 01-Jan[-01]
                value = info.month(tokens[idx + 2])

                if value is not None:
                    ymd.append(value, 'M')
                else:
                    raise ValueError()

            if idx + 3 < len_l and tokens[idx + 3] == sep:
                # We have three members
                # NOTE: tokens[idx + 4] is not bounds-checked here; a
                # trailing separator surfaces as IndexError.
                value = info.month(tokens[idx + 4])

                if value is not None:
                    ymd.append(value, 'M')
                else:
                    ymd.append(tokens[idx + 4])
                idx += 2

            idx += 1
        idx += 1

    elif idx + 1 >= len_l or info.jump(tokens[idx + 1]):
        if idx + 2 < len_l and info.ampm(tokens[idx + 2]) is not None:
            # 12 am
            hour = int(value)
            res.hour = self._adjust_ampm(hour, info.ampm(tokens[idx + 2]))
            idx += 1
        else:
            # Year, month or day
            ymd.append(value)
        idx += 1

    elif info.ampm(tokens[idx + 1]) is not None and (0 <= value < 24):
        # 12am
        hour = int(value)
        res.hour = self._adjust_ampm(hour, info.ampm(tokens[idx + 1]))
        idx += 1

    elif ymd.could_be_day(value):
        ymd.append(value)

    elif not fuzzy:
        raise ValueError()

    return idx
  826. def _find_hms_idx(self, idx, tokens, info, allow_jump):
  827. len_l = len(tokens)
  828. if idx+1 < len_l and info.hms(tokens[idx+1]) is not None:
  829. # There is an "h", "m", or "s" label following this token. We take
  830. # assign the upcoming label to the current token.
  831. # e.g. the "12" in 12h"
  832. hms_idx = idx + 1
  833. elif (allow_jump and idx+2 < len_l and tokens[idx+1] == ' ' and
  834. info.hms(tokens[idx+2]) is not None):
  835. # There is a space and then an "h", "m", or "s" label.
  836. # e.g. the "12" in "12 h"
  837. hms_idx = idx + 2
  838. elif idx > 0 and info.hms(tokens[idx-1]) is not None:
  839. # There is a "h", "m", or "s" preceding this token. Since neither
  840. # of the previous cases was hit, there is no label following this
  841. # token, so we use the previous label.
  842. # e.g. the "04" in "12h04"
  843. hms_idx = idx-1
  844. elif (1 < idx == len_l-1 and tokens[idx-1] == ' ' and
  845. info.hms(tokens[idx-2]) is not None):
  846. # If we are looking at the final token, we allow for a
  847. # backward-looking check to skip over a space.
  848. # TODO: Are we sure this is the right condition here?
  849. hms_idx = idx - 2
  850. else:
  851. hms_idx = None
  852. return hms_idx
  853. def _assign_hms(self, res, value_repr, hms):
  854. # See GH issue #427, fixing float rounding
  855. value = self._to_decimal(value_repr)
  856. if hms == 0:
  857. # Hour
  858. res.hour = int(value)
  859. if value % 1:
  860. res.minute = int(60*(value % 1))
  861. elif hms == 1:
  862. (res.minute, res.second) = self._parse_min_sec(value)
  863. elif hms == 2:
  864. (res.second, res.microsecond) = self._parsems(value_repr)
  865. def _could_be_tzname(self, hour, tzname, tzoffset, token):
  866. return (hour is not None and
  867. tzname is None and
  868. tzoffset is None and
  869. len(token) <= 5 and
  870. (all(x in string.ascii_uppercase for x in token)
  871. or token in self.info.UTCZONE))
  872. def _ampm_valid(self, hour, ampm, fuzzy):
  873. """
  874. For fuzzy parsing, 'a' or 'am' (both valid English words)
  875. may erroneously trigger the AM/PM flag. Deal with that
  876. here.
  877. """
  878. val_is_ampm = True
  879. # If there's already an AM/PM flag, this one isn't one.
  880. if fuzzy and ampm is not None:
  881. val_is_ampm = False
  882. # If AM/PM is found and hour is not, raise a ValueError
  883. if hour is None:
  884. if fuzzy:
  885. val_is_ampm = False
  886. else:
  887. raise ValueError('No hour specified with AM or PM flag.')
  888. elif not 0 <= hour <= 12:
  889. # If AM/PM is found, it's a 12 hour clock, so raise
  890. # an error for invalid range
  891. if fuzzy:
  892. val_is_ampm = False
  893. else:
  894. raise ValueError('Invalid hour specified for 12-hour clock.')
  895. return val_is_ampm
  896. def _adjust_ampm(self, hour, ampm):
  897. if hour < 12 and ampm == 1:
  898. hour += 12
  899. elif hour == 12 and ampm == 0:
  900. hour = 0
  901. return hour
  902. def _parse_min_sec(self, value):
  903. # TODO: Every usage of this function sets res.second to the return
  904. # value. Are there any cases where second will be returned as None and
  905. # we *don't* want to set res.second = None?
  906. minute = int(value)
  907. second = None
  908. sec_remainder = value % 1
  909. if sec_remainder:
  910. second = int(60 * sec_remainder)
  911. return (minute, second)
  912. def _parse_hms(self, idx, tokens, info, hms_idx):
  913. # TODO: Is this going to admit a lot of false-positives for when we
  914. # just happen to have digits and "h", "m" or "s" characters in non-date
  915. # text? I guess hex hashes won't have that problem, but there's plenty
  916. # of random junk out there.
  917. if hms_idx is None:
  918. hms = None
  919. new_idx = idx
  920. elif hms_idx > idx:
  921. hms = info.hms(tokens[hms_idx])
  922. new_idx = hms_idx
  923. else:
  924. # Looking backwards, increment one.
  925. hms = info.hms(tokens[hms_idx]) + 1
  926. new_idx = idx
  927. return (new_idx, hms)
  928. # ------------------------------------------------------------------
  929. # Handling for individual tokens. These are kept as methods instead
  930. # of functions for the sake of customizability via subclassing.
  931. def _parsems(self, value):
  932. """Parse a I[.F] seconds value into (seconds, microseconds)."""
  933. if "." not in value:
  934. return int(value), 0
  935. else:
  936. i, f = value.split(".")
  937. return int(i), int(f.ljust(6, "0")[:6])
  938. def _to_decimal(self, val):
  939. try:
  940. decimal_value = Decimal(val)
  941. # See GH 662, edge case, infinite value should not be converted
  942. # via `_to_decimal`
  943. if not decimal_value.is_finite():
  944. raise ValueError("Converted decimal value is infinite or NaN")
  945. except Exception as e:
  946. msg = "Could not convert %s to decimal" % val
  947. six.raise_from(ValueError(msg), e)
  948. else:
  949. return decimal_value
  950. # ------------------------------------------------------------------
  951. # Post-Parsing construction of datetime output. These are kept as
  952. # methods instead of functions for the sake of customizability via
  953. # subclassing.
  954. def _build_tzinfo(self, tzinfos, tzname, tzoffset):
  955. if callable(tzinfos):
  956. tzdata = tzinfos(tzname, tzoffset)
  957. else:
  958. tzdata = tzinfos.get(tzname)
  959. # handle case where tzinfo is paased an options that returns None
  960. # eg tzinfos = {'BRST' : None}
  961. if isinstance(tzdata, datetime.tzinfo) or tzdata is None:
  962. tzinfo = tzdata
  963. elif isinstance(tzdata, text_type):
  964. tzinfo = tz.tzstr(tzdata)
  965. elif isinstance(tzdata, integer_types):
  966. tzinfo = tz.tzoffset(tzname, tzdata)
  967. else:
  968. raise TypeError("Offset must be tzinfo subclass, tz string, "
  969. "or int offset.")
  970. return tzinfo
  971. def _build_tzaware(self, naive, res, tzinfos):
  972. if (callable(tzinfos) or (tzinfos and res.tzname in tzinfos)):
  973. tzinfo = self._build_tzinfo(tzinfos, res.tzname, res.tzoffset)
  974. aware = naive.replace(tzinfo=tzinfo)
  975. aware = self._assign_tzname(aware, res.tzname)
  976. elif res.tzname and res.tzname in time.tzname:
  977. aware = naive.replace(tzinfo=tz.tzlocal())
  978. # Handle ambiguous local datetime
  979. aware = self._assign_tzname(aware, res.tzname)
  980. # This is mostly relevant for winter GMT zones parsed in the UK
  981. if (aware.tzname() != res.tzname and
  982. res.tzname in self.info.UTCZONE):
  983. aware = aware.replace(tzinfo=tz.UTC)
  984. elif res.tzoffset == 0:
  985. aware = naive.replace(tzinfo=tz.UTC)
  986. elif res.tzoffset:
  987. aware = naive.replace(tzinfo=tz.tzoffset(res.tzname, res.tzoffset))
  988. elif not res.tzname and not res.tzoffset:
  989. # i.e. no timezone information was found.
  990. aware = naive
  991. elif res.tzname:
  992. # tz-like string was parsed but we don't know what to do
  993. # with it
  994. warnings.warn("tzname {tzname} identified but not understood. "
  995. "Pass `tzinfos` argument in order to correctly "
  996. "return a timezone-aware datetime. In a future "
  997. "version, this will raise an "
  998. "exception.".format(tzname=res.tzname),
  999. category=UnknownTimezoneWarning)
  1000. aware = naive
  1001. return aware
  1002. def _build_naive(self, res, default):
  1003. repl = {}
  1004. for attr in ("year", "month", "day", "hour",
  1005. "minute", "second", "microsecond"):
  1006. value = getattr(res, attr)
  1007. if value is not None:
  1008. repl[attr] = value
  1009. if 'day' not in repl:
  1010. # If the default day exceeds the last day of the month, fall back
  1011. # to the end of the month.
  1012. cyear = default.year if res.year is None else res.year
  1013. cmonth = default.month if res.month is None else res.month
  1014. cday = default.day if res.day is None else res.day
  1015. if cday > monthrange(cyear, cmonth)[1]:
  1016. repl['day'] = monthrange(cyear, cmonth)[1]
  1017. naive = default.replace(**repl)
  1018. if res.weekday is not None and not res.day:
  1019. naive = naive + relativedelta.relativedelta(weekday=res.weekday)
  1020. return naive
  1021. def _assign_tzname(self, dt, tzname):
  1022. if dt.tzname() != tzname:
  1023. new_dt = tz.enfold(dt, fold=1)
  1024. if new_dt.tzname() == tzname:
  1025. return new_dt
  1026. return dt
  1027. def _recombine_skipped(self, tokens, skipped_idxs):
  1028. """
  1029. >>> tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
  1030. >>> skipped_idxs = [0, 1, 2, 5]
  1031. >>> _recombine_skipped(tokens, skipped_idxs)
  1032. ["foo bar", "baz"]
  1033. """
  1034. skipped_tokens = []
  1035. for i, idx in enumerate(sorted(skipped_idxs)):
  1036. if i > 0 and idx - 1 == skipped_idxs[i - 1]:
  1037. skipped_tokens[-1] = skipped_tokens[-1] + tokens[idx]
  1038. else:
  1039. skipped_tokens.append(tokens[idx])
  1040. return skipped_tokens
# Shared parser instance built with the default constructor arguments;
# used by the module-level parse() when no ``parserinfo`` is supplied.
DEFAULTPARSER = parser()
  1042. def parse(timestr, parserinfo=None, **kwargs):
  1043. """
  1044. Parse a string in one of the supported formats, using the
  1045. ``parserinfo`` parameters.
  1046. :param timestr:
  1047. A string containing a date/time stamp.
  1048. :param parserinfo:
  1049. A :class:`parserinfo` object containing parameters for the parser.
  1050. If ``None``, the default arguments to the :class:`parserinfo`
  1051. constructor are used.
  1052. The ``**kwargs`` parameter takes the following keyword arguments:
  1053. :param default:
  1054. The default datetime object, if this is a datetime object and not
  1055. ``None``, elements specified in ``timestr`` replace elements in the
  1056. default object.
  1057. :param ignoretz:
  1058. If set ``True``, time zones in parsed strings are ignored and a naive
  1059. :class:`datetime` object is returned.
  1060. :param tzinfos:
  1061. Additional time zone names / aliases which may be present in the
  1062. string. This argument maps time zone names (and optionally offsets
  1063. from those time zones) to time zones. This parameter can be a
  1064. dictionary with timezone aliases mapping time zone names to time
  1065. zones or a function taking two parameters (``tzname`` and
  1066. ``tzoffset``) and returning a time zone.
  1067. The timezones to which the names are mapped can be an integer
  1068. offset from UTC in seconds or a :class:`tzinfo` object.
  1069. .. doctest::
  1070. :options: +NORMALIZE_WHITESPACE
  1071. >>> from dateutil.parser import parse
  1072. >>> from dateutil.tz import gettz
  1073. >>> tzinfos = {"BRST": -7200, "CST": gettz("America/Chicago")}
  1074. >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos)
  1075. datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -7200))
  1076. >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos)
  1077. datetime.datetime(2012, 1, 19, 17, 21,
  1078. tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago'))
  1079. This parameter is ignored if ``ignoretz`` is set.
  1080. :param dayfirst:
  1081. Whether to interpret the first value in an ambiguous 3-integer date
  1082. (e.g. 01/05/09) as the day (``True``) or month (``False``). If
  1083. ``yearfirst`` is set to ``True``, this distinguishes between YDM and
  1084. YMD. If set to ``None``, this value is retrieved from the current
  1085. :class:`parserinfo` object (which itself defaults to ``False``).
  1086. :param yearfirst:
  1087. Whether to interpret the first value in an ambiguous 3-integer date
  1088. (e.g. 01/05/09) as the year. If ``True``, the first number is taken to
  1089. be the year, otherwise the last number is taken to be the year. If
  1090. this is set to ``None``, the value is retrieved from the current
  1091. :class:`parserinfo` object (which itself defaults to ``False``).
  1092. :param fuzzy:
  1093. Whether to allow fuzzy parsing, allowing for string like "Today is
  1094. January 1, 2047 at 8:21:00AM".
  1095. :param fuzzy_with_tokens:
  1096. If ``True``, ``fuzzy`` is automatically set to True, and the parser
  1097. will return a tuple where the first element is the parsed
  1098. :class:`datetime.datetime` datetimestamp and the second element is
  1099. a tuple containing the portions of the string which were ignored:
  1100. .. doctest::
  1101. >>> from dateutil.parser import parse
  1102. >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)
  1103. (datetime.datetime(2047, 1, 1, 8, 21), (u'Today is ', u' ', u'at '))
  1104. :return:
  1105. Returns a :class:`datetime.datetime` object or, if the
  1106. ``fuzzy_with_tokens`` option is ``True``, returns a tuple, the
  1107. first element being a :class:`datetime.datetime` object, the second
  1108. a tuple containing the fuzzy tokens.
  1109. :raises ParserError:
  1110. Raised for invalid or unknown string formats, if the provided
  1111. :class:`tzinfo` is not in a valid format, or if an invalid date would
  1112. be created.
  1113. :raises OverflowError:
  1114. Raised if the parsed date exceeds the largest valid C integer on
  1115. your system.
  1116. """
  1117. if parserinfo:
  1118. return parser(parserinfo).parse(timestr, **kwargs)
  1119. else:
  1120. return DEFAULTPARSER.parse(timestr, **kwargs)
  1121. class _tzparser(object):
  1122. class _result(_resultbase):
  1123. __slots__ = ["stdabbr", "stdoffset", "dstabbr", "dstoffset",
  1124. "start", "end"]
  1125. class _attr(_resultbase):
  1126. __slots__ = ["month", "week", "weekday",
  1127. "yday", "jyday", "day", "time"]
  1128. def __repr__(self):
  1129. return self._repr("")
  1130. def __init__(self):
  1131. _resultbase.__init__(self)
  1132. self.start = self._attr()
  1133. self.end = self._attr()
  1134. def parse(self, tzstr):
  1135. res = self._result()
  1136. l = [x for x in re.split(r'([,:.]|[a-zA-Z]+|[0-9]+)',tzstr) if x]
  1137. used_idxs = list()
  1138. try:
  1139. len_l = len(l)
  1140. i = 0
  1141. while i < len_l:
  1142. # BRST+3[BRDT[+2]]
  1143. j = i
  1144. while j < len_l and not [x for x in l[j]
  1145. if x in "0123456789:,-+"]:
  1146. j += 1
  1147. if j != i:
  1148. if not res.stdabbr:
  1149. offattr = "stdoffset"
  1150. res.stdabbr = "".join(l[i:j])
  1151. else:
  1152. offattr = "dstoffset"
  1153. res.dstabbr = "".join(l[i:j])
  1154. for ii in range(j):
  1155. used_idxs.append(ii)
  1156. i = j
  1157. if (i < len_l and (l[i] in ('+', '-') or l[i][0] in
  1158. "0123456789")):
  1159. if l[i] in ('+', '-'):
  1160. # Yes, that's right. See the TZ variable
  1161. # documentation.
  1162. signal = (1, -1)[l[i] == '+']
  1163. used_idxs.append(i)
  1164. i += 1
  1165. else:
  1166. signal = -1
  1167. len_li = len(l[i])
  1168. if len_li == 4:
  1169. # -0300
  1170. setattr(res, offattr, (int(l[i][:2]) * 3600 +
  1171. int(l[i][2:]) * 60) * signal)
  1172. elif i + 1 < len_l and l[i + 1] == ':':
  1173. # -03:00
  1174. setattr(res, offattr,
  1175. (int(l[i]) * 3600 +
  1176. int(l[i + 2]) * 60) * signal)
  1177. used_idxs.append(i)
  1178. i += 2
  1179. elif len_li <= 2:
  1180. # -[0]3
  1181. setattr(res, offattr,
  1182. int(l[i][:2]) * 3600 * signal)
  1183. else:
  1184. return None
  1185. used_idxs.append(i)
  1186. i += 1
  1187. if res.dstabbr:
  1188. break
  1189. else:
  1190. break
  1191. if i < len_l:
  1192. for j in range(i, len_l):
  1193. if l[j] == ';':
  1194. l[j] = ','
  1195. assert l[i] == ','
  1196. i += 1
  1197. if i >= len_l:
  1198. pass
  1199. elif (8 <= l.count(',') <= 9 and
  1200. not [y for x in l[i:] if x != ','
  1201. for y in x if y not in "0123456789+-"]):
  1202. # GMT0BST,3,0,30,3600,10,0,26,7200[,3600]
  1203. for x in (res.start, res.end):
  1204. x.month = int(l[i])
  1205. used_idxs.append(i)
  1206. i += 2
  1207. if l[i] == '-':
  1208. value = int(l[i + 1]) * -1
  1209. used_idxs.append(i)
  1210. i += 1
  1211. else:
  1212. value = int(l[i])
  1213. used_idxs.append(i)
  1214. i += 2
  1215. if value:
  1216. x.week = value
  1217. x.weekday = (int(l[i]) - 1) % 7
  1218. else:
  1219. x.day = int(l[i])
  1220. used_idxs.append(i)
  1221. i += 2
  1222. x.time = int(l[i])
  1223. used_idxs.append(i)
  1224. i += 2
  1225. if i < len_l:
  1226. if l[i] in ('-', '+'):
  1227. signal = (-1, 1)[l[i] == "+"]
  1228. used_idxs.append(i)
  1229. i += 1
  1230. else:
  1231. signal = 1
  1232. used_idxs.append(i)
  1233. res.dstoffset = (res.stdoffset + int(l[i]) * signal)
  1234. # This was a made-up format that is not in normal use
  1235. warn(('Parsed time zone "%s"' % tzstr) +
  1236. 'is in a non-standard dateutil-specific format, which ' +
  1237. 'is now deprecated; support for parsing this format ' +
  1238. 'will be removed in future versions. It is recommended ' +
  1239. 'that you switch to a standard format like the GNU ' +
  1240. 'TZ variable format.', tz.DeprecatedTzFormatWarning)
  1241. elif (l.count(',') == 2 and l[i:].count('/') <= 2 and
  1242. not [y for x in l[i:] if x not in (',', '/', 'J', 'M',
  1243. '.', '-', ':')
  1244. for y in x if y not in "0123456789"]):
  1245. for x in (res.start, res.end):
  1246. if l[i] == 'J':
  1247. # non-leap year day (1 based)
  1248. used_idxs.append(i)
  1249. i += 1
  1250. x.jyday = int(l[i])
  1251. elif l[i] == 'M':
  1252. # month[-.]week[-.]weekday
  1253. used_idxs.append(i)
  1254. i += 1
  1255. x.month = int(l[i])
  1256. used_idxs.append(i)
  1257. i += 1
  1258. assert l[i] in ('-', '.')
  1259. used_idxs.append(i)
  1260. i += 1
  1261. x.week = int(l[i])
  1262. if x.week == 5:
  1263. x.week = -1
  1264. used_idxs.append(i)
  1265. i += 1
  1266. assert l[i] in ('-', '.')
  1267. used_idxs.append(i)
  1268. i += 1
  1269. x.weekday = (int(l[i]) - 1) % 7
  1270. else:
  1271. # year day (zero based)
  1272. x.yday = int(l[i]) + 1
  1273. used_idxs.append(i)
  1274. i += 1
  1275. if i < len_l and l[i] == '/':
  1276. used_idxs.append(i)
  1277. i += 1
  1278. # start time
  1279. len_li = len(l[i])
  1280. if len_li == 4:
  1281. # -0300
  1282. x.time = (int(l[i][:2]) * 3600 +
  1283. int(l[i][2:]) * 60)
  1284. elif i + 1 < len_l and l[i + 1] == ':':
  1285. # -03:00
  1286. x.time = int(l[i]) * 3600 + int(l[i + 2]) * 60
  1287. used_idxs.append(i)
  1288. i += 2
  1289. if i + 1 < len_l and l[i + 1] == ':':
  1290. used_idxs.append(i)
  1291. i += 2
  1292. x.time += int(l[i])
  1293. elif len_li <= 2:
  1294. # -[0]3
  1295. x.time = (int(l[i][:2]) * 3600)
  1296. else:
  1297. return None
  1298. used_idxs.append(i)
  1299. i += 1
  1300. assert i == len_l or l[i] == ','
  1301. i += 1
  1302. assert i >= len_l
  1303. except (IndexError, ValueError, AssertionError):
  1304. return None
  1305. unused_idxs = set(range(len_l)).difference(used_idxs)
  1306. res.any_unused_tokens = not {l[n] for n in unused_idxs}.issubset({",",":"})
  1307. return res
# Shared _tzparser instance used by _parsetz().
DEFAULTTZPARSER = _tzparser()
def _parsetz(tzstr):
    """Parse ``tzstr`` with the shared ``DEFAULTTZPARSER`` and return its result."""
    return DEFAULTTZPARSER.parse(tzstr)
  1311. class ParserError(ValueError):
  1312. """Exception subclass used for any failure to parse a datetime string.
  1313. This is a subclass of :py:exc:`ValueError`, and should be raised any time
  1314. earlier versions of ``dateutil`` would have raised ``ValueError``.
  1315. .. versionadded:: 2.8.1
  1316. """
  1317. def __str__(self):
  1318. try:
  1319. return self.args[0] % self.args[1:]
  1320. except (TypeError, IndexError):
  1321. return super(ParserError, self).__str__()
  1322. def __repr__(self):
  1323. args = ", ".join("'%s'" % arg for arg in self.args)
  1324. return "%s(%s)" % (self.__class__.__name__, args)
class UnknownTimezoneWarning(RuntimeWarning):
    """Warning issued when the parser finds a timezone it cannot parse into a tzinfo.

    .. versionadded:: 2.7.0
    """
  1329. # vim:ts=4:sw=4:et