From 7ee6bd4fb4280021a9132ed47007b2d9816a385c Mon Sep 17 00:00:00 2001
From: Henrik Levkowetz
Date: Fri, 4 Dec 2020 16:04:01 +0000
Subject: [PATCH] When doing test-crawling, ignore variations of the 'next=' query arg. (The code ignores other query args if 'next' is given).

 - Legacy-Id: 18730
---
 bin/test-crawl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bin/test-crawl b/bin/test-crawl
index 4e60d820c..416f60f34 100755
--- a/bin/test-crawl
+++ b/bin/test-crawl
@@ -83,6 +83,9 @@ def strip_url(url):
     fragment_url = re.search("^(.+)#[a-z_.-]+$", url)
     if fragment_url:
         url = fragment_url.group(1)
+    next_url = re.search(r"^(.+)\?next=.+$", url)
+    if next_url:
+        url = next_url.group(1)
     return url
 
 def extract_html_urls(content):