Skip to content

Running many imports in series

There is no elegant way to run many imports other than starting them one after another. Doing so by hand via the web interface can become cumbersome.

Assuming you have already created all imports (but not started them yet!) and uploaded their files, the following script will run through them and start each one in turn.

import logging
import asyncio
from pathlib import Path

from sqlalchemy import select

from nacsos_data.db import get_engine_async as get_engine
from nacsos_data.db.crud.imports import read_all_imports_for_project
from nacsos_data.util.academic.wos import read_wos_file
from nacsos_data.util.academic.scopus import read_scopus_csv_file
from nacsos_data.util.academic.importer import import_academic_items

# Project whose imports are started; replace the placeholder with the real id.
PROJECT_ID = '[PROJECT_ID]'
# Base directory against which the source file paths of an import are resolved.
PATH = Path('/var/www/nacsos2/.data/user_data')

# Write all records of level INFO and above to a log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
    filename='rwi_reimport.log',
)

logger = logging.getLogger('reimport')
logger.setLevel(logging.INFO)


async def main() -> None:
    """Start every import of project ``PROJECT_ID``, one after another.

    Fetches all imports of the project and, for each one, passes a callable
    that reads the import's first source file (resolved against ``PATH``) to
    ``import_academic_items``. The reader is chosen by ``imp.config.kind``
    ('scopus' or 'wos'). Each import is awaited before the next one starts.

    Raises:
        ValueError: If an import has a kind other than 'scopus' or 'wos'.
            Imports processed before the offending one have already run.
    """
    # Maps the kind of an import to the function that reads its source file.
    readers = {
        'scopus': read_scopus_csv_file,
        'wos': read_wos_file,
    }

    db_engine = get_engine(conf_file='/path/to/config.env')

    imports = await read_all_imports_for_project(db_engine=db_engine, project_id=PROJECT_ID)
    for imp in imports:
        kind = imp.config.kind
        reader = readers.get(kind)
        if reader is None:
            raise ValueError(f'Unknown kind {kind}')

        # Only the first source file of an import is read.
        filepath = str(PATH / imp.config.sources[0])

        # Bind reader and path as defaults so the callable keeps referring to
        # THIS import, regardless of when it is invoked (closures over loop
        # variables are late-binding).
        def works(reader=reader, filepath=filepath):
            return reader(filepath=filepath, project_id=PROJECT_ID)

        # One logger per import, named after the import.
        ilog = logging.getLogger(str(imp.name))
        await import_academic_items(db_engine=db_engine,
                                    project_id=PROJECT_ID,
                                    import_id=imp.import_id,
                                    new_items=works,
                                    dry_run=False,
                                    trust_new_authors=True,
                                    trust_new_keywords=True,
                                    log=ilog)


# Only start the import run when this file is executed directly as a script.
if __name__ == '__main__':
    asyncio.run(main())
Cleaning all (partial) existing imports

This is very dangerous. Only do this if you are absolutely sure of what you are doing and fully understand all of its implications!

-- Removes all items, item variants, import links and tasks of one project and
-- resets the run state of its imports.
-- Replace every '[PROJECT_ID]' placeholder before running.
-- NOTE(review): consider running this inside a transaction so that a failure
-- does not leave the project half-cleaned.

-- Variants are found by joining academic_item on project_id, so they must be
-- deleted while the academic_item rows still exist.
DELETE
FROM academic_item_variant aiv
    USING academic_item ai
WHERE aiv.item_id = ai.item_id
  AND ai.project_id = '[PROJECT_ID]';

DELETE
FROM academic_item
WHERE project_id = '[PROJECT_ID]';

-- Import/item links are found by joining item on project_id, so this has to
-- run before the rows in item are deleted.
DELETE
FROM m2m_import_item m2m
    USING item i
WHERE m2m.item_id = i.item_id
  AND i.project_id = '[PROJECT_ID]';

DELETE
FROM tasks
WHERE project_id = '[PROJECT_ID]';

DELETE
FROM item
WHERE project_id = '[PROJECT_ID]';

-- Keep the import rows themselves, but clear their task reference and
-- start/finish timestamps.
UPDATE import
SET pipeline_task_id=NULL, time_started=NULL, time_finished=NULL
WHERE project_id = '[PROJECT_ID]';