parserec.py (9122B)
#!/usr/bin/python3
"""Helpers for publication records: parsing and venue/author formatting.

Records are read by parse() from a text file made of "key: value" lines,
with blank lines separating records.  The remaining helpers format author
names, URLs and venue names from such records.
"""

import sys

# Localized strings, indexed as stz[key][lang] with lang in ('en', 'fr').
stz = {
    'submitted': {
        'en': "Under review",
        'fr': "Soumis au comité de lecture",
    },
    'draft': {
        'en': "Draft",
        'fr': "Version préliminaire",
    },
    'journalversion': {
        'en': "journal version",
        'fr': "version journal",
    },
    'conferenceversion': {
        'en': "conference version",
        'fr': "version conférence",
    },
    'journalversion_explain': {
        'en': "Extended journal publication:",
        'fr': "Version étendue correspondante :",
    },
    'conferenceversion_explain': {
        'en': "Extended version of the conference publication:",
        'fr': "Version étendue de la publication conférence :",
    },
    'slides': {
        'en': "slides",
        'fr': "exposé",
    },
    'slideslong': {
        'en': "longer slides",
        'fr': "exposé plus détaillé",
    },
    'conference': {
        'en': "conference",
        'fr': "une conférence",
    },
    'workshop': {
        'en': "workshop",
        'fr': "un workshop",
    },
    'journal': {
        'en': "journal",
        'fr': "une revue",
    },
    'published at non oa': {
        'en': "Published at a closed-access %s",
        'fr': "Publié dans %s non accessible en libre accès",
    },
    'oaexplain': {
        'en': "[why?]",
        'fr': "[explications (en anglais)]",
    },
    'poster': {
        'en': "poster",
        'fr': "poster",
    },
    'video': {
        'en': "video",
        'fr': "vidéo",
    },
    'videoin': {
        'en': "video in",
        'fr': "vidéo en",
    },
    'videoon': {
        'en': "video on",
        'fr': "vidéo sur",
    },
    'direct download': {
        'en': "direct download",
        'fr': "téléchargement direct",
    },
    'on': {
        'en': "on",
        'fr': "sur",
    },
    'by': {
        'en': "by",
        'fr': "par",
    },
    'oron': {
        'en': "or on",
        'fr': "ou sur",
    },
    'orin': {
        'en': "or in",
        'fr': "ou en",
    },
    'short': {
        'en': "lightning talk",
        'fr': "exposé bref",
    },
    'code': {
        'en': "code",
        'fr': "code",
    },
    'phddefense': {
        'en': "PhD defense",
        'fr': "soutenance de thèse",
    },
    'habilitationdefense': {
        'en': "Habilitation defense",
        'fr': "soutenance d'habilitation à diriger des recherches",
    },
    'phddefenserehearsal': {
        'en': "PhD defense rehearsal",
        'fr': "répétition de soutenance de thèse",
    },
    'habilitationthesis': {
        'en': "Habilitation thesis",
        'fr': "Manuscrit d'habilitation à diriger des recherches",
    },
    'and': {
        'en': "and",
        'fr': "et",
    },
    'demo': {
        'en': "Demo paper",
        'fr': "Démonstration",
    },
    'spotlight': {
        'en': "Spotlight presentation",
        'fr': "Exposé spotlight",
    },
    'shortpaper': {
        'en': "Short paper",
        'fr': "Article court",
    },
    'posterpaper': {
        'en': "Poster paper",
        'fr': "Article poster",
    },
}

stopwords = ["at", "au", "du", "at the", "of the", "for project", "du projet"]

# Publication types that getvenue() accepts with a directly-given venue.
talk_types = ['poster', 'short', 'phddefenserehearsal']


def authorname(author, sepnames=False):
    """Return the display name of an author record.

    An explicit 'name' field is returned unchanged.  Otherwise the name is
    built from 'firstname' and 'lastname': "Last, First" when sepnames is
    true, "First Last" otherwise.
    """
    if 'name' in author:
        return author['name']
    if sepnames:
        return author['lastname'] + ', ' + author['firstname']
    return author['firstname'] + ' ' + author['lastname']


def isurlrel(url):
    """Return True if url is relative.

    A URL is considered relative when it starts neither with 'http' nor
    with '/'.
    """
    return not url.startswith(('http', '/'))


def absurl(url, site, local):
    """Return an absolute version of url.

    URLs starting with 'http' are returned unchanged, URLs starting with
    '/' are prefixed with site, and any other URL is prefixed with local.
    """
    if url.startswith('http'):
        return url
    if url.startswith('/'):
        return site + url
    return local + url


def getyear(publi):
    """Return the year of a publication record as an int.

    The 'year' field is used when present.  Otherwise the year is guessed
    from the 'id' field: the (up to) four characters starting at the first
    digit are converted to int.

    Raises IndexError if the id contains no digit, and ValueError if the
    selected characters do not form an integer.
    """
    if 'year' in publi:
        return int(publi['year'])
    # guess a year from the first digits found in the id
    pos = 0
    title = publi['id']
    while not title[pos].isdigit():
        pos += 1
    return int(title[pos:pos + 4])


def endswithpunct(publi):
    """Return True if the publication title ends with '.', '?' or '!'."""
    return publi['title'].endswith((".", "?", "!"))


def mkvenuename(venueo, venuesz, short=False, year=True):
    """Return the display name of a venue.

    venueo is the venue record and venuesz the mapping of all venue records
    by id.  An explicit 'name' in venueo is returned unchanged.  Otherwise
    the name is derived from the id: the part before the first digit,
    upper-cased (or the 'name' of the record whose id is this id minus its
    last four characters, when the id ends with four digits and that record
    exists), followed by the rest of the id.

    With short=True the first two characters of the numeric part are
    dropped and an apostrophe is used as separator ("conf2042" ->
    "CONF'42"); otherwise a space is used ("CONF 2042").  With year=False
    only the prefix is returned.
    """
    venue = venueo['id']
    venue_prevo = None
    if venue[-4:].isdigit():
        # conf2042 => look for a generic "conf" record
        venue_prevo = venuesz.get(venue[:-4])
    if 'name' in venueo:
        return venueo['name']

    # Make a name from the id.  The scan is bounded so that an id without
    # any digit yields the upper-cased id instead of an IndexError.
    pos = 0
    while pos < len(venue) and not venue[pos].isdigit():
        pos += 1
    prename = venue[:pos].upper()
    if venue_prevo and 'name' in venue_prevo:
        prename = venue_prevo['name']
    suffix = venue[pos:]
    if not year or not suffix:
        return prename
    if short and len(suffix) > 2:
        # skip the first two characters of the year; only done when
        # something remains, to avoid a dangling apostrophe
        return prename + "'" + suffix[2:]
    return prename + ' ' + suffix


def getvenue(publi, lang, venuesz, short=False):
    """Return venue information for a publication.

    publi is the publication record, lang the language key used for
    localized strings and fields, venuesz the mapping of venue records by
    id, and short selects short venue names (and the 'venueshort' field of
    publi when present).

    Returns a 7-tuple
    (name, fullname, type, url, issue, oa, keywords):

    - Without a 'venue' field: if publi has a 'status', name and fullname
      are the localized status string and type is the status; otherwise all
      strings are empty.  url, issue and oa are '' and keywords is an empty
      set in both cases.
    - With a 'venue' field: type is 'conference' or 'journal' for known
      venues of that type and None otherwise; url may be None; issue is
      publi's 'issue' or ''; oa is a bool that is True when the venue's
      'oa' field is 'yes' or 'n/a'; keywords is a set of strings describing
      the venue (audience, 'formal'/'informal', 'TYPE<type>').

    Raises AssertionError if a known venue has no 'oa' field or has
    unexpected 'audience', 'informal' or 'type' values, or if a
    directly-given venue is used with an unexpected publication type.
    """
    if 'venue' not in publi:
        if 'status' in publi:
            name = stz[publi['status']][lang]
            return (name, name, publi['status'], '', '', '', set())
        # phdthesis or mscthesis or habilitationthesis
        return ('', '', '', '', '', '', set())

    venue = publi['venue']
    venueid = venue
    if short and 'venueshort' in publi:
        venue = publi['venueshort']
    url = None
    typ = None
    keywords = set()
    oa = None
    fullname = None
    venue_no_year = venue

    if venue in venuesz:
        venueo = venuesz[venue]
        if 'oa' not in venueo:
            print("missing OA info for %s" % venue, file=sys.stderr)
            # explicit raise: a bare assert would be stripped under -O
            raise AssertionError("missing OA info for %s" % venue)
        oa = venueo['oa']
        if 'audience' in venueo:
            assert (venueo['audience'] in ['national', 'international'])
            keywords.add(venueo['audience'])
        if 'informal' in venueo:
            assert (venueo['informal'] == 'yes')
            keywords.add("informal")
        else:
            keywords.add("formal")
        if 'type' in venueo:
            assert (venueo['type'] in ['school', 'conference', 'workshop',
                                       'journal', 'book'])
            if venueo['type'] in ['conference', 'journal']:
                typ = venueo['type']
            # the book I have isn't really a book
            keywords.add('TYPE' + (venueo['type'] if venueo['type'] != 'book'
                                   else 'conference'))
        if 'url' in venueo:
            url = venueo['url']
        if 'fullname' in venueo:
            fullname = venueo['fullname']
        venue = mkvenuename(venueo, venuesz, short)
        venue_no_year = mkvenuename(venueo, venuesz, short, year=False)
        if fullname and venue:
            fullname += " (" + venue + ")"
    else:
        if 'venueurl' in publi:
            url = publi['venueurl']
        # the venue is given directly (deprecated, used for talks and some
        # special types)
        assert ('type' not in publi or publi['type'] in
                ['patent', 'mscthesis', 'phdthesis', 'habilitationthesis',
                 'note'] + talk_types)
        # NOTE(review): oa is True here, but the returned flag is computed
        # as `oa in ['yes', 'n/a']`, which is False for True.  Directly
        # given venues are therefore reported with oa == False.  Kept as-is
        # because callers are not visible here; confirm the intent.
        oa = True

    lvenue = 'venue' + lang
    if lvenue in publi:
        venue = publi[lvenue]
    if fullname is None:
        fullname = venue
    if venueid[-4:].isdigit():
        # conf2042 => conf
        venue_prev = venueid[:-4]
        if venue_prev in venuesz:
            if 'fullname' in venuesz[venue_prev]:
                fullname = (venuesz[venue_prev]['fullname']
                            + " (" + venue_no_year + ")")

    return (venue, fullname, typ, url, publi.get('issue', ''),
            oa in ['yes', 'n/a'], keywords)


def parse(fname):
    """Yield the records of file fname as dicts.

    Each non-blank line is split at its first ':' into a key (stripped and
    lower-cased) and a value (stripped); a line without ':' gives an empty
    value.  Lines whose first non-blank character is '#' are ignored.
    Blank lines end the current record; empty records are not yielded.
    """
    with open(fname, 'r') as f:
        current = {}
        for line in f:
            stripped = line.strip()
            if stripped.startswith('#'):
                continue
            if not stripped:
                if current:
                    yield current
                    current = {}
                continue
            key, _, value = stripped.partition(':')
            current[key.strip().lower()] = value.strip()
        if current:
            yield current