When test-crawling, ignore variations of the 'next=' query argument. (When 'next' is given, the code also ignores any other query arguments.)
- Legacy-Id: 18730
This commit is contained in:
parent
9a8c6ae3f4
commit
7ee6bd4fb4
|
@ -83,6 +83,9 @@ def strip_url(url):
|
|||
fragment_url = re.search("^(.+)#[a-z_.-]+$", url)
|
||||
if fragment_url:
|
||||
url = fragment_url.group(1)
|
||||
next_url = re.search(r"^(.+)\?next=.+$", url)
|
||||
if next_url:
|
||||
url = next_url.group(1)
|
||||
return url
|
||||
|
||||
def extract_html_urls(content):
|
||||
|
|
Loading…
Reference in a new issue