Improve test-crawler regexp so it can catch and visit linked feed URLs

- Legacy-Id: 7104
This commit is contained in:
Ole Laursen 2014-01-10 17:34:33 +00:00
parent b4dfae121b
commit 5dcd140a63

View file

@@ -33,7 +33,7 @@ def strip_url(url):
return url
def extract_html_urls(content):
for m in re.finditer(r'<a.*href="([^"]+)">', content):
for m in re.finditer(r'<(?:a|link) [^>]*href="([^"]+)"', content):
url = strip_url(m.group(1))
if len(url) > MAX_URL_LENGTH:
continue # avoid infinite GET parameter appendages