granicus.if.org Git - python/commitdiff
Close issue 3437 - missing state change when Allow lines are processed.
author: Skip Montanaro <skip@pobox.com>
Sun, 27 Jul 2008 00:49:02 +0000 (00:49 +0000)
committer: Skip Montanaro <skip@pobox.com>
Sun, 27 Jul 2008 00:49:02 +0000 (00:49 +0000)
Adds test cases which use Allow: as well.

Lib/robotparser.py
Lib/test/test_robotparser.py

index f249187e2c5ab0ed4e92147a00c4214a3b5a1b05..447563fe654d86db1fe8e26c83bfd088d24bf378 100644 (file)
@@ -76,6 +76,10 @@ class RobotFileParser:
         """parse the input lines from a robots.txt file.
            We allow that a user-agent: line is not preceded by
            one or more blank lines."""
+        # states:
+        #   0: start state
+        #   1: saw user-agent line
+        #   2: saw an allow or disallow line
         state = 0
         linenumber = 0
         entry = Entry()
@@ -114,6 +118,7 @@ class RobotFileParser:
                 elif line[0] == "allow":
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
+                        state = 2
         if state == 2:
             self.entries.append(entry)
 
index b7911fd61c4560215d452a8ea91b1981797fe9cc..431b8ffbd98e9e7ac02a1a4dd6fc70aa295a4daf 100644 (file)
@@ -134,6 +134,75 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC
 
 RobotTest(7, doc, good, bad)
 
+# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364
+
+# 8.
+doc = """
+User-agent: Googlebot
+Allow: /folder1/myfile.html
+Disallow: /folder1/
+"""
+
+good = ['/folder1/myfile.html']
+bad = ['/folder1/anotherfile.html']
+
+RobotTest(8, doc, good, bad, agent="Googlebot")
+
+# 9.  This file is incorrect because "Googlebot" is a substring of
+#     "Googlebot-Mobile", so test 10 works just like test 9.
+doc = """
+User-agent: Googlebot
+Disallow: /
+
+User-agent: Googlebot-Mobile
+Allow: /
+"""
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(9, doc, good, bad, agent="Googlebot")
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
+
+# 11.  Get the order correct.
+doc = """
+User-agent: Googlebot-Mobile
+Allow: /
+
+User-agent: Googlebot
+Disallow: /
+"""
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(11, doc, good, bad, agent="Googlebot")
+
+good = ['/something.jpg']
+bad = []
+
+RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")
+
+
+# 13.  Google also got the order wrong in #8.  You need to specify the
+#      URLs from more specific to more general.
+doc = """
+User-agent: Googlebot
+Allow: /folder1/myfile.html
+Disallow: /folder1/
+"""
+
+good = ['/folder1/myfile.html']
+bad = ['/folder1/anotherfile.html']
+
+RobotTest(13, doc, good, bad, agent="googlebot")
+
+
+
 class TestCase(unittest.TestCase):
     def runTest(self):
         test_support.requires('network')