.. versionadded:: 3.6
+   .. method:: site_maps()
+
+      Returns the contents of the ``Sitemap`` parameter from
+      ``robots.txt`` in the form of a :func:`list`. If there is no such
+      parameter or the ``robots.txt`` entry for this parameter has
+      invalid syntax, returns ``None``.
+
+      .. versionadded:: 3.8
+
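+      A minimal usage sketch (the URLs below are hypothetical placeholders;
+      with no ``Sitemap`` lines in ``robots.txt`` the call would return
+      ``None``)::
+
+         >>> import urllib.robotparser
+         >>> rp = urllib.robotparser.RobotFileParser()
+         >>> rp.set_url("https://www.example.com/robots.txt")
+         >>> rp.read()
+         >>> rp.site_maps()
+         ['https://www.example.com/sitemap.xml']
+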
The following example demonstrates basic use of the :class:`RobotFileParser`
class::
    agent = 'test_robotparser'
    good = []
    bad = []
+    site_maps = None
    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))
+    def test_site_maps(self):
+        self.assertEqual(self.parser.site_maps(), self.site_maps)
+
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
bad = ['/cyberworld/map/index.html']
+class SitemapTest(BaseRobotTest, unittest.TestCase):
+ robots_txt = """\
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Request-rate: 3/15
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+ """
+ good = ['/', '/test.html']
+ bad = ['/cyberworld/map/index.html']
+ site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
+ 'http://www.google.com/hostednews/sitemap_index.xml']
+
+
class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
# go away
    def __init__(self, url=''):
        self.entries = []
+        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
                            and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
+                elif line[0] == "sitemap":
+                    # According to http://www.sitemaps.org/protocol.html
+                    # "This directive is independent of the user-agent line,
+                    #  so it doesn't matter where you place it in your file."
+                    # Therefore we do not change the state of the parser.
+                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)
                return entry.req_rate
        return self.default_entry.req_rate

+    def site_maps(self):
+        if not self.sitemaps:
+            return None
+        return self.sitemaps
+
    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
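For reference, a minimal sketch (not part of the patch) of how the new method
behaves when the parser is fed lines directly through parse(); the robots.txt
content and URLs below are made-up placeholders:

    import urllib.robotparser

    # Hypothetical robots.txt lines; Sitemap directives may appear anywhere in the file.
    lines = [
        "User-agent: *",
        "Disallow: /private/",
        "Sitemap: https://www.example.com/sitemap.xml",
        "Sitemap: https://www.example.com/news/sitemap_index.xml",
    ]

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)

    # Sitemap values are collected in file order, independent of User-agent blocks.
    print(parser.site_maps())
    # ['https://www.example.com/sitemap.xml',
    #  'https://www.example.com/news/sitemap_index.xml']

    # With no Sitemap lines, site_maps() returns None rather than an empty list.
    empty = urllib.robotparser.RobotFileParser()
    empty.parse(["User-agent: *", "Disallow: /"])
    print(empty.site_maps())   # None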
Mike Bayer
Samuel L. Bayer
Bo Bayles
+Christopher Beacham AKA Lady Red
Tommy Beadle
Donald Beaudry
David Beazley
Blake Winton
Jean-Claude Wippler
Stéphane Wirtel
+Peter Wirtz
Lars Wirzenius
John Wiseman
Chris Withers
--- /dev/null
+Added support for site maps to urllib's ``RobotFileParser`` as
+:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`.
+Patch by Lady Red, based on patch by Peter Wirtz.