When doing test-crawling, ignore variations of the 'next=' query arg. (The code ignores other query args if 'next' is given.)
- Legacy-Id: 18730
This commit is contained in:
parent
9a8c6ae3f4
commit
7ee6bd4fb4
|
@ -83,6 +83,9 @@ def strip_url(url):
|
||||||
fragment_url = re.search("^(.+)#[a-z_.-]+$", url)
|
fragment_url = re.search("^(.+)#[a-z_.-]+$", url)
|
||||||
if fragment_url:
|
if fragment_url:
|
||||||
url = fragment_url.group(1)
|
url = fragment_url.group(1)
|
||||||
|
next_url = re.search(r"^(.+)\?next=.+$", url)
|
||||||
|
if next_url:
|
||||||
|
url = next_url.group(1)
|
||||||
return url
|
return url
|
||||||
|
|
||||||
def extract_html_urls(content):
|
def extract_html_urls(content):
|
||||||
|
|
Loading…
Reference in a new issue