  1. """ robotparser.py
  2. Copyright (C) 2000 Bastian Kleineidam
  3. You can choose between two licenses when using this package:
  4. 1) GNU GPLv2
  5. 2) PSF license for Python 2.2
  6. The robots.txt Exclusion Protocol is implemented as specified in
  7. http://www.robotstxt.org/norobots-rfc.txt
  8. """
  9. import collections
  10. import urllib.parse
  11. import urllib.request
  12. __all__ = ["RobotFileParser"]
  13. RequestRate = collections.namedtuple("RequestRate", "requests seconds")
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
    """

    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]
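
    # Sketch of what set_url() stores (the URL below is illustrative):
    #
    #     urllib.parse.urlparse("http://example.com/robots.txt")[1:3]
    #     # -> ('example.com', '/robots.txt')
    #
    # so self.host holds the network location and self.path the robots.txt path.
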
    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())
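
    # Minimal usage sketch (hypothetical host; requires network access):
    #
    #     rp = RobotFileParser("https://example.com/robots.txt")
    #     rp.read()   # 401/403 -> disallow_all; other 4xx -> allow_all
    #     rp.can_fetch("MyBot/1.0", "https://example.com/private/page.html")
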
    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)
    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    # so it doesn't matter where you place it in your file."
                    # Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)
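
    # Example of what parse() builds (the input lines are illustrative):
    #
    #     rp = RobotFileParser()
    #     rp.parse([
    #         "User-agent: *",
    #         "Disallow: /private/",
    #         "Crawl-delay: 5",
    #     ])
    #
    # This yields one default entry (user-agent "*") holding a Disallow rule
    # and a crawl delay of 5; rp.can_fetch("AnyBot", "/private/x") is False.
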
    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
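
    # Illustrative lookup order (the agent and path names are hypothetical):
    #
    #     rp.parse(["User-agent: FooBot", "Disallow: /tmp/",
    #               "", "User-agent: *", "Disallow: /private/"])
    #     rp.can_fetch("FooBot/2.1", "/tmp/x")     # False: FooBot entry matches first
    #     rp.can_fetch("BarBot", "/tmp/x")         # True: falls through to the "*" entry
    #     rp.can_fetch("BarBot", "/private/x")     # False: default entry's Disallow rule
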
    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None
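
    # Sketch of the throttling accessors (directive values are illustrative):
    #
    #     rp.parse(["User-agent: *", "Crawl-delay: 5", "Request-rate: 3/10",
    #               "Disallow: /cgi-bin/"])
    #     rp.crawl_delay("MyBot")     # -> 5
    #     rp.request_rate("MyBot")    # -> RequestRate(requests=3, seconds=10)
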
    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))

class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
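
# Prefix-matching sketch for RuleLine (the paths below are illustrative):
#
#     RuleLine("/private/", False).applies_to("/private/page.html")  # True
#     RuleLine("", False).allowance       # True: an empty Disallow value allows all
#     str(RuleLine("/tmp/", True))        # 'Allow: /tmp/'
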
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
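

# Offline demo (a sketch added for illustration; not part of the upstream
# module). It exercises parse() and the query methods on an in-memory
# robots.txt, so no network access is needed. The crawler names and paths
# below are hypothetical.
if __name__ == "__main__":
    demo_lines = [
        "User-agent: FigTree",          # hypothetical specific crawler
        "Disallow: /tmp",
        "",
        "User-agent: *",
        "Crawl-delay: 2",
        "Disallow: /cyberworld/map/",
    ]
    rp = RobotFileParser()
    rp.parse(demo_lines)
    print(rp)                                       # reconstructed rules, per entry
    print(rp.can_fetch("FigTree/1.0", "/tmp/x"))    # False: explicit Disallow for FigTree
    print(rp.can_fetch("OtherBot", "/tmp/x"))       # True: only the "*" rules apply
    print(rp.crawl_delay("OtherBot"))               # 2, taken from the default entry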