Added support for reverse-order (i.e., Japanese, Chinese, and other) names with uppercase family name in the draft submission author extraction.

- Legacy-Id: 4949
This commit is contained in:
Henrik Levkowetz 2012-10-23 12:33:21 +00:00
parent 61ad24dde5
commit 45585957ef

View file

@ -40,7 +40,7 @@ import stat
import sys
import time
version = "0.27"
version = "0.28"
program = os.path.basename(sys.argv[0])
progdir = os.path.dirname(sys.argv[0])
@ -649,6 +649,9 @@ class Draft():
first, last = author.rsplit(" ", 1)
else:
first, last = author.rsplit(" ", 1)
if "." in first and not ". " in first:
first = first.replace(".", ". ").strip()
prefix_match = re.search(" %(prefix)s$" % aux, first)
if prefix_match:
prefix = prefix_match.group(1)
@ -656,85 +659,97 @@ class Draft():
last = prefix+" "+last
_debug("First, Last: '%s' '%s'" % (first, last))
for firstname, surname, casefixname in [ (first,last,last), (last,first,first), (first,last,last.upper()), (last,first,first.upper()), ]:
author = "%s %s" % (firstname, casefixname)
_debug("\nAuthors: "+str(authors))
_debug("Author: "+author)
for left, right in [(firstname, casefixname), (casefixname, firstname)]:
author = "%s %s" % (left, right)
_debug("\nAuthors: "+str(authors))
_debug("Author: "+author)
# Pattern for full author information search, based on first page author name:
authpat = make_authpat(aux['honor'], firstname, casefixname, aux['suffix'])
_debug("Authpat: " + authpat)
start = 0
col = None
# Find start of author info for this author (if any).
# Scan towards the front from the end of the file, looking for a match to authpath
for j in range(last_line, address_section_pos, -1):
line = self.lines[j]
_debug( "Line: " + line)
forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
for form in forms:
try:
if re.search(authpat, form.strip()) and not j in found_pos:
_debug( "Match")
start = j
found_pos += [ start ]
_debug( " ==> start %s, normalized '%s'" % (start, form.strip()))
# The author info could be formatted in multiple columns...
columns = re.split("( +| and )", form)
# _debug( "Columns:" + str(columns))
# Find which column:
# _debug( "Col range:" + str(range(len(columns))))
# Pattern for full author information search, based on first page author name:
authpat = make_authpat(aux['honor'], left, right, aux['suffix'])
_debug("Authpat: " + authpat)
start = 0
col = None
# Find start of author info for this author (if any).
# Scan towards the front from the end of the file, looking for a match to authpath
for j in range(last_line, address_section_pos, -1):
line = self.lines[j]
_debug( "Line: " + line)
forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
for form in forms:
try:
if re.search(authpat, form.strip()) and not j in found_pos:
_debug( "Match")
cols = [ c for c in range(len(columns)) if re.search(authpat+r"( and |, |$)", columns[c].strip()) ]
if cols:
col = cols[0]
if not (start, col) in found_pos:
found_pos += [ (start, col) ]
_debug( "Col: %d" % col)
beg = len("".join(columns[:col]))
_debug( "Beg: %d '%s'" % (beg, "".join(columns[:col])))
_debug( "Len: %d" % len(columns))
if col == len(columns) or col == len(columns)-1:
end = None
_debug( "End1: %s" % end)
else:
end = beg + len("".join(columns[col:col+2]))
_debug( "End2: %d '%s'" % (end, "".join(columns[col:col+2])))
_debug( "Cut: '%s'" % form[beg:end])
author_match = re.search(authpat, columns[col].strip()).group(1)
_debug( "AuthMatch: '%s'" % (author_match,))
if author_match in companies_seen:
companies[i] = authors[i]
authors[i] = None
else:
if casefixname in author_match:
fullname = author_match.replace(casefixname, surname)
start = j
found_pos += [ start ]
_debug( " ==> start %s, normalized '%s'" % (start, form.strip()))
# The author info could be formatted in multiple columns...
columns = re.split("( +| and )", form)
# _debug( "Columns:" + str(columns))
# Find which column:
# _debug( "Col range:" + str(range(len(columns))))
cols = [ c for c in range(len(columns)) if re.search(authpat+r"( and |, |$)", columns[c].strip()) ]
if cols:
col = cols[0]
if not (start, col) in found_pos:
found_pos += [ (start, col) ]
_debug( "Col: %d" % col)
beg = len("".join(columns[:col]))
_debug( "Beg: %d '%s'" % (beg, "".join(columns[:col])))
_debug( "Len: %d" % len(columns))
if col == len(columns) or col == len(columns)-1:
end = None
_debug( "End1: %s" % end)
else:
end = beg + len("".join(columns[col:col+2]))
_debug( "End2: %d '%s'" % (end, "".join(columns[col:col+2])))
_debug( "Cut: '%s'" % form[beg:end])
author_match = re.search(authpat, columns[col].strip()).group(1)
_debug( "AuthMatch: '%s'" % (author_match,))
if re.search('\(.*\)$', author_match.strip()):
author_match = author_match.rsplit('(',1)[0].strip()
if author_match in companies_seen:
companies[i] = authors[i]
authors[i] = None
else:
fullname = author_match
fullname = re.sub(" +", " ", fullname)
given_names, surname = fullname.rsplit(None, 1)
if " " in given_names:
first, middle = given_names.split(None, 1)
else:
first = given_names
middle = None
names = (first, middle, surname, suffix)
if suffix:
fullname = fullname+" "+suffix
if not " ".join([ n for n in names if n ]) == fullname:
_err("Author tuple doesn't match text in draft: %s, %s" % (authors[i], fullname))
authors[i] = (fullname, first, middle, surname, suffix)
companies[i] = None
#_debug( "Author: %s: %s" % (author_match, authors[author_match]))
break
except AssertionError, e:
sys.stderr.write("filename: "+self.filename+"\n")
sys.stderr.write("authpat: "+authpat+"\n")
raise
#if casefixname in author_match:
# fullname = author_match.replace(casefixname, surname)
#else:
# fullname = author_match
fullname = re.sub(" +", " ", fullname)
if left == firstname:
given_names, surname = fullname.rsplit(None, 1)
else:
surname, given_names = fullname.split(None, 1)
if " " in given_names:
first, middle = given_names.split(None, 1)
else:
first = given_names
middle = None
names = (first, middle, surname, suffix)
if suffix:
fullname = fullname+" "+suffix
parts = [ n for n in names if n ]
revpt = [ n for n in names if n ]
revpt.reverse()
if not ((" ".join(parts) == fullname) or (" ".join(revpt) == fullname)):
_err("Author tuple doesn't match text in draft: %s, %s" % (authors[i], fullname))
authors[i] = (fullname, first, middle, surname, suffix)
companies[i] = None
break
except AssertionError, e:
sys.stderr.write("filename: "+self.filename+"\n")
sys.stderr.write("authpat: "+authpat+"\n")
raise
if start and col != None:
break
if start and col != None:
break
if start and col != None:
break
# End for:
if not authors[i]:
continue
_debug("2: authors[%s]: %s" % (i, authors[i]))
@ -763,7 +778,7 @@ class Draft():
# for a in authors:
# if a and a not in companies_seen:
# _debug("Search for: %s"%(r"(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"))
authmatch = [ a for a in authors[i+1:] if a and not a.lower() in companies_seen and (re.search((r"(?i)(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"), line.strip()) or acronym_match(a, line.strip()) )]
authmatch = [ a for a in authors[i+1:] if a and not a.lower() in companies_seen and (re.search((r"(?i)(^|\W)"+re.sub("[. ]+", ".* ", a)+"(\W|$)"), line.strip()) or acronym_match(a, line.strip()) )]
if authmatch:
_debug(" ? Other author or company ? : %s" % authmatch)
_debug(" Line: "+line.strip())