Improve test-crawler regexp so it can catch and visit linked feed URLs
- Legacy-Id: 7104
This commit is contained in:
parent
b4dfae121b
commit
5dcd140a63
|
@@ -33,7 +33,7 @@ def strip_url(url):
|
|||
return url
|
||||
|
||||
def extract_html_urls(content):
|
||||
for m in re.finditer(r'<a.*href="([^"]+)">', content):
|
||||
for m in re.finditer(r'<(?:a|link) [^>]*href="([^"]+)"', content):
|
||||
url = strip_url(m.group(1))
|
||||
if len(url) > MAX_URL_LENGTH:
|
||||
continue # avoid infinite GET parameter appendages
|
||||
|
|
Loading…
Reference in a new issue