Skip to content
Snippets Groups Projects
Commit 7237a9f8 authored by Guilhem Saurel
Browse files

check invalid entries

parent a8e3388e
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3 #!/usr/bin/env python3
from concurrent.futures import ThreadPoolExecutor
import re import re
from pathlib import Path from pathlib import Path
from bibtexparser import load, loads from bibtexparser import load, loads
from bibtexparser.bibdatabase import BibDatabase from bibtexparser.bibdatabase import BibDatabase
from requests import get from requests import get
from requests_futures.sessions import FuturesSession
HAL_RE = [ HAL_RE = [
...@@ -14,9 +16,11 @@ HAL_RE = [ ...@@ -14,9 +16,11 @@ HAL_RE = [
(re.compile(r'/\d\d/\d\d/\d\d/\d\d/'), (re.compile(r'/\d\d/\d\d/\d\d/\d\d/'),
lambda g: 'hal-%08i' % int(g.replace('/', ''))), lambda g: 'hal-%08i' % int(g.replace('/', ''))),
] ]
HAL_URL = 'https://hal.archives-ouvertes.fr/%s/bibtex'
HAL_KEYS = ['url', 'link', 'pdf', 'video'] HAL_KEYS = ['url', 'link', 'pdf', 'video']
HAL_DICT = {} HAL_DICT = {}
USELESS_KEYS = {'hal_local_reference', 'hal_version', 'address', 'note', 'month'} USELESS_KEYS = {'hal_local_reference', 'hal_version', 'address', 'note', 'month'}
GEPETTO_URL = 'http://projects.laas.fr/gepetto/index.php/Publications/BibtexEntry?bibtex=%s'
TEAM_NAMES = { TEAM_NAMES = {
'ad': ['del prete'], 'ad': ['del prete'],
...@@ -54,7 +58,7 @@ def get_hal_entry(hal_id, hal_db): ...@@ -54,7 +58,7 @@ def get_hal_entry(hal_id, hal_db):
for key in hal_db.entries_dict.keys(): for key in hal_db.entries_dict.keys():
if key.endswith(hal_id): if key.endswith(hal_id):
return hal_db.entries_dict[key] return hal_db.entries_dict[key]
url = 'https://hal.archives-ouvertes.fr/%s/bibtex' % hal_id url = HAL_URL % hal_id
r = get(url) r = get(url)
r.raise_for_status() r.raise_for_status()
if 'Aucun document trouvé' in r.content.decode(): if 'Aucun document trouvé' in r.content.decode():
...@@ -107,6 +111,13 @@ if __name__ == '__main__': ...@@ -107,6 +111,13 @@ if __name__ == '__main__':
for initials, names in TEAM_NAMES.items(): for initials, names in TEAM_NAMES.items():
if not_in_dbs[initials]: if not_in_dbs[initials]:
header(names[0]) header(names[0])
with open('diffs/%s.txt' % initials, 'w') as f: for url, title in not_in_dbs[initials]:
for url, title in not_in_dbs[initials]: print('IN HAL', url, title)
print(url, title, file=f) urls = [GEPETTO_URL % entry['ID'] for key in dbs.keys() for entry in dbs[key].entries]
session = FuturesSession(executor=ThreadPoolExecutor(max_workers=10))
futures = [session.get(u) for u in urls]
toto = None
for future in futures:
response = future.result()
if b'Invalid bibtex entry' in response.content:
print('INVALID', response.url)
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment