# Copyright The IETF Trust 2016-2020, All Rights Reserved
# -*- coding: utf-8 -*-

# various utilities for working with the mailarch mail archive at
# mailarchive.ietf.org

import base64
import contextlib
import datetime
import email.utils
import hashlib
import mailbox
import tarfile
import tempfile

from urllib.parse import urlencode
from urllib.request import urlopen

import debug  # pyflakes:ignore

from pyquery import PyQuery

from django.conf import settings
from django.utils.encoding import force_bytes, force_str

from ietf.utils.mail import get_payload_text
from ietf.utils.timezone import date_today

def list_name_from_email(list_email):
    if not list_email.endswith("@ietf.org"):
        return None

    return list_email[:-len("@ietf.org")]
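
# Illustrative examples (not in the original module; the addresses are
# hypothetical):
#
#   list_name_from_email("quic@ietf.org")        -> "quic"
#   list_name_from_email("someone@example.com")  -> None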

def hash_list_message_id(list_name, msgid):
    # hash in mailarch is computed similar to
    # https://www.mail-archive.com/faq.html#listserver except the list
    # name (without "@ietf.org") is used instead of the full address,
    # and rightmost "=" signs are (optionally) stripped
    sha = hashlib.sha1(force_bytes(msgid))
    sha.update(force_bytes(list_name))
    return force_str(base64.urlsafe_b64encode(sha.digest()).rstrip(b"="))
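
# Illustrative sketch (not in the original module; the list name and
# Message-ID are hypothetical). Because hashlib's update() concatenates
# input, the call
#
#   hash_list_message_id("quic", "<msgid@example.com>")
#
# produces the same string as
#
#   force_str(base64.urlsafe_b64encode(
#       hashlib.sha1(b"<msgid@example.com>" + b"quic").digest()
#   ).rstrip(b"="))
#
# i.e. the URL-safe base64 SHA-1 of the Message-ID followed by the list name,
# with trailing "=" padding removed.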

def construct_query_urls(doc, team, query=None):
    list_name = list_name_from_email(team.list_email)
    if not list_name:
        return None

    if not query:
        query = doc.name

    encoded_query = "?" + urlencode({
        "qdr": "c",  # custom time frame
        "start_date": (date_today() - datetime.timedelta(days=180)).isoformat(),
        "email_list": list_name,
        "q": "subject:({})".format(query),
        "as": "1",  # this is an advanced search
    })

    return {
        "query": query,
        "query_url": settings.MAILING_LIST_ARCHIVE_URL + "/arch/search/" + encoded_query,
        "query_data_url": settings.MAILING_LIST_ARCHIVE_URL + "/arch/export/mbox/" + encoded_query,
    }
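
# Illustrative sketch (not in the original module; the team, document and
# setting values are hypothetical). For a team list "art@ietf.org" and a
# document named "draft-foo-bar", the function returns roughly
#
#   {
#       "query": "draft-foo-bar",
#       "query_url": settings.MAILING_LIST_ARCHIVE_URL
#           + "/arch/search/?qdr=c&start_date=...&email_list=art&q=subject%3A%28draft-foo-bar%29&as=1",
#       "query_data_url": settings.MAILING_LIST_ARCHIVE_URL
#           + "/arch/export/mbox/?qdr=c&start_date=...&email_list=art&q=subject%3A%28draft-foo-bar%29&as=1",
#   }
#
# i.e. a browser-facing search URL and a matching mbox export URL covering the
# last 180 days of list traffic, or None if the list is not under @ietf.org.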

def construct_message_url(list_name, msgid):
    return "{}/arch/msg/{}/{}".format(settings.MAILING_LIST_ARCHIVE_URL, list_name, hash_list_message_id(list_name, msgid))
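
# Illustrative sketch (not in the original module; the values are
# hypothetical). With settings.MAILING_LIST_ARCHIVE_URL set to
# "https://mailarchive.ietf.org",
#
#   construct_message_url("quic", "<msgid@example.com>")
#
# returns "https://mailarchive.ietf.org/arch/msg/quic/<hash>", where <hash> is
# hash_list_message_id("quic", "<msgid@example.com>").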

def retrieve_messages_from_mbox(mbox_fileobj):
    """Return selected content from the messages in an mbox file from mailarch."""
    res = []
    with tempfile.NamedTemporaryFile(suffix=".mbox") as mbox_file:
        # mailbox.mbox needs a path, so we need to put the contents
        # into a file
        mbox_data = mbox_fileobj.read()
        mbox_file.write(mbox_data)
        mbox_file.flush()

        mbox = mailbox.mbox(mbox_file.name, create=False)
        for msg in mbox:
            content = ""

            for part in msg.walk():
                if part.get_content_type() == "text/plain":
                    charset = part.get_content_charset() or "utf-8"
                    content += get_payload_text(part, default_charset=charset)

            # parse a couple of things for the front end
            utcdate = None
            d = email.utils.parsedate_tz(msg["Date"])
            if d:
                utcdate = datetime.datetime.fromtimestamp(email.utils.mktime_tz(d), datetime.timezone.utc)

            res.append({
                "from": msg["From"],
                "splitfrom": email.utils.parseaddr(msg["From"]),
                "subject": msg["Subject"],
                "content": content.replace("\r\n", "\n").replace("\r", "\n").strip("\n"),
                "message_id": email.utils.unquote(msg["Message-ID"].strip()),
                "url": email.utils.unquote(msg["Archived-At"].strip()),
                "date": msg["Date"],
                "utcdate": (utcdate.date().isoformat(), utcdate.time().isoformat()) if utcdate else ("", ""),
            })
        mbox.close()

    return res
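
# Illustrative sketch (not in the original module; all values are
# hypothetical). Each entry of the returned list is a dict of the form
#
#   {
#       "from": "Jane Doe <jane@example.com>",      # raw From header
#       "splitfrom": ("Jane Doe", "jane@example.com"),
#       "subject": "A subject line",
#       "content": "joined text/plain parts, newline-normalized",
#       "message_id": "msgid@example.com",          # angle brackets stripped
#       "url": "https://mailarchive.ietf.org/arch/msg/...",
#       "date": "raw Date header",
#       "utcdate": ("2020-01-01", "12:00:00"),      # ("", "") if Date cannot be parsed
#   }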

def retrieve_messages(query_data_url):
    """Retrieve and return selected content from mailarch."""
    res = []

    # This has not been rewritten to use requests.get() because get() does
    # not handle file URLs out of the box, which we need for testing
    with contextlib.closing(urlopen(query_data_url, timeout=15)) as fileobj:
        content_type = fileobj.info()["Content-type"]
        if not content_type.startswith("application/x-tar"):
            if content_type.startswith("text/html"):
                r = fileobj.read(20000)
                q = PyQuery(r)
                div = q('div[class~="no-results"]')
                if div:
                    raise KeyError("No results: %s -> %s" % (query_data_url, div.text(), ))
            raise Exception("Export failed - this usually means no matches were found")

        with tarfile.open(fileobj=fileobj, mode='r|*') as tar:
            for entry in tar:
                if entry.isfile():
                    mbox_fileobj = tar.extractfile(entry)
                    res.extend(retrieve_messages_from_mbox(mbox_fileobj))

    return res
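
# Illustrative usage sketch (not in the original module; the names and file
# path are hypothetical). In the datatracker the export URL normally comes
# from construct_query_urls(), while tests can point at a local tarball of
# mbox files via a file:// URL, which is why urlopen() is used above:
#
#   urls = construct_query_urls(doc, team)
#   if urls:
#       messages = retrieve_messages(urls["query_data_url"])
#
#   messages = retrieve_messages("file:///path/to/mailarch-export.tar")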