When doing test-crawling, ignore variations of the 'next=' query arg. (The code ignores other query args if 'next' is given.)

- Legacy-Id: 18730
This commit is contained in:
Henrik Levkowetz 2020-12-04 16:04:01 +00:00
parent 9a8c6ae3f4
commit 7ee6bd4fb4

View file

@@ -83,6 +83,9 @@ def strip_url(url):
fragment_url = re.search("^(.+)#[a-z_.-]+$", url)
if fragment_url:
url = fragment_url.group(1)
next_url = re.search(r"^(.+)\?next=.+$", url)
if next_url:
url = next_url.group(1)
return url
def extract_html_urls(content):