Tweaked the author extraction code to handle company names in the author list on the first page, when the company names contain a comma, such as for instance 'Foo Bar, Inc'.

- Legacy-Id: 4781
This commit is contained in:
Henrik Levkowetz 2012-08-22 12:52:32 +00:00
parent c90a26ca50
commit 7467fa48a5

View file

@ -450,8 +450,7 @@ class Draft():
r"(?:, | )([Ee]d\.?|\([Ee]d\.?\)|[Ee]ditor)$",
]
companyformats = [
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?Inc\.?))$",
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?Ltd\.?))$",
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?(Inc|Ltd|AB|S\.A)\.?))$",
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(/([A-Za-z'][-A-Za-z0-9.& ']+))+)$",
r" {6}([a-z0-9.-]+)$",
r" {6}(([A-Za-z'][-A-Za-z0-9.&']+)( [A-Za-z'][-A-Za-z0-9.&']+)*)$",
@ -503,6 +502,7 @@ class Draft():
for line in self.lines[:30]:
self._docheader += line+"\n"
author_on_line = False
company_on_line = False
_debug( "**" + line)
leading_space = len(re.findall("^ *", line)[0])
line_len = len(line.rstrip())
@ -526,7 +526,7 @@ class Draft():
for lineformat, authformat in multiauthformats:
match = re.search(lineformat, line)
if match:
_debug("Multiauth format: '%s'" % lineformat)
_debug("a. Multiauth format: '%s'" % lineformat)
author_list = re.findall(authformat, line)
authors += [ a[0] for a in author_list ]
companies += [ None for a in author_list ]
@ -540,22 +540,28 @@ class Draft():
for lineformat in authcompanyformats:
match = re.search(lineformat, line)
if match:
_debug("Line format: '%s'" % lineformat)
author = match.group("author")
company = match.group("company")
authors += [ author, '']
companies += [ None, company ]
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Author: '%s'" % author)
_debug("Company: '%s'" % company)
author_on_line = True
break
_debug("b. Line format: '%s'" % lineformat)
maybe_company = match.group("company").strip(" ,.")
# is the putative company name just a partial name, i.e., a part
# that commonly occurs after a comma as part of a company name,
# as in "Foo Bar, Inc."? If so, skip; else assume there's a
# company name after the comma.
if not maybe_company in ["Inc", "Ltd", "S.A", "AG", "AB", "N.V", ]:
author = match.group("author")
company = match.group("company")
authors += [ author, '']
companies += [ None, company ]
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Author: '%s'" % author)
_debug("Company: '%s'" % company)
author_on_line = True
break
if not author_on_line:
for authformat in authformats:
match = re.search(authformat, line)
if match:
_debug("Auth format: '%s'" % authformat)
_debug("c. Auth format: '%s'" % authformat)
author = match.group(1)
authors += [ author ]
companies += [ None ]
@ -568,10 +574,11 @@ class Draft():
for authformat in companyformats:
match = re.search(authformat, line)
if match:
_debug("Auth format: '%s'" % authformat)
_debug("d. Company format: '%s'" % authformat)
company = match.group(1)
authors += [ "" ]
companies += [ company ]
company_on_line = True
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Company: '%s'" % company)
@ -582,7 +589,7 @@ class Draft():
companies += [ "" ]
if line.strip() == "":
if prev_blankline and authors:
_debug("Breaking for having found consecutive blank lines after author name")
_debug("Breaking, having found consecutive blank lines after author name")
break
if authors:
have_blankline = True
@ -592,7 +599,7 @@ class Draft():
if "draft-" in line:
have_draftline = True
if have_blankline and have_draftline:
_debug("Breaking for having found both blank line and draft-name line")
_debug("Breaking, having found both blank line and draft-name line")
break
# remove trailing blank entries in the author list:
@ -607,6 +614,8 @@ class Draft():
#companies = [ None if a else '' for a in authors ]
#_debug("B:companies : %s" % str(companies))
#find authors' addresses section if it exists
_debug("B:authors : %s" % str(authors))
last_line = len(self.lines)-1
address_section_pos = last_line/2
for i in range(last_line/2,last_line):
@ -990,7 +999,12 @@ def _output(docname, fields, outfile=sys.stdout):
else:
if opt_attributes:
def outputkey(key, fields):
outfile.write("%-24s: %s\n" % ( key, fields[key].strip().replace("\\", "\\\\" ).replace("'", "\\x27" )))
field = fields[key]
if "\n" in field:
field = "\n" + field.rstrip()
else:
field = field.strip()
outfile.write("%-24s: %s\n" % ( key, field.replace("\\", "\\\\" ).replace("'", "\\x27" )))
else:
def outputkey(key, fields):
outfile.write(" %s='%s'" % ( key.lower(), fields[key].strip().replace("\\", "\\\\" ).replace("'", "\\x27" ).replace("\n", "\\n")))