
Generic item import

We cannot implement custom item types for every kind of data source, and often there is little benefit in doing so. For this reason, the platform supports a "generic" item type that consists only of a text and arbitrary fields stored in a jsonb object.

You can create a project with item type "generic" and adapt the following script to import your dataset. All metadata you want to store in the database goes into the meta field. Please be conservative about what you add here: be kind to our database and think about how much space the data will need.
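
If you want a rough feel for how much space your metadata will take, you could serialise a few records before importing them. This is only a crude proxy for the actual jsonb storage, and the example record below is made up:

import json

# 'records' stands in for the list of meta payloads you plan to store
records = [{'Title': 'Example grant', 'Funding (USD)': 100000, '_end_date': '2025-12-31'}]
total_bytes = sum(len(json.dumps(record).encode('utf-8')) for record in records)
print(f'~{total_bytes / 1024:.1f} KiB of metadata across {len(records)} items')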

By default, all fields in the meta column are shown in the interface. You can prefix a field key with _ to hide it.
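
For illustration, the meta object of a single item could look like this (the field names follow the example script below, the values are made up):

{
    'Title': 'Direct air capture pilot study',  # shown in the interface
    'Funding (USD)': 100000,                    # shown in the interface
    '_end_date': '2025-12-31',                  # hidden, key starts with '_'
    '_funder_org_name': 'Example Funder'        # hidden, key starts with '_'
}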

Script
import json
import uuid
import logging
import datetime
from pathlib import Path

from sqlalchemy import insert

from nacsos_data.db.schemas import GenericItem, m2m_import_item_table, Import
from nacsos_data.db import get_engine
from nacsos_data.models.imports import M2MImportItemType

db_engine = get_engine(conf_file='/path/to/config.env')

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(name)s: %(message)s', level=logging.INFO)
logger = logging.getLogger('import')
logger.setLevel(logging.DEBUG)

# Path to the JSONL source file (one JSON object per line)
SOURCE_FILE = Path('/home/tim/Downloads/all_grants_cdr_ta_only_deduplicated.jsonl')

# The project the generic data should be imported into
PROJECT_ID = 'EDIT ME'
# The user the import will be associated with
USER_ID = 'EDIT ME'
# Descriptive name for the import
IMPORT_NAME = 'EXAMPLE | EDIT ME'
# Additional description for the import
DESCRIPTION = 'EXAMPLE | EDIT ME'

# Fields to include in the database
# ... shown in the interface (source field name -> display name)
FIELDS_VISIBLE = {
    'id': 'dimensions_id',
    'title': 'Title',
    'funding_usd': 'Funding (USD)',
    'query_combined': 'CDR Queries',
    'start_year': 'Year (started)',
    'linkout': 'URL'
}
# ... not shown in the interface
FIELDS_HIDDEN = [
    'active_year', 'category_for', 'category_for_2020', 'category_sdg', 'category_uoa', 'concepts', 'concepts_scores',
    'date_inserted', 'dimensions_url', 'end_date', 'foa_number', 'funder_org_cities', 'funder_org_countries',
    'funder_org_name', 'funder_org_states', 'funder_orgs', 'funding_aud', 'funding_cad', 'funding_chf', 'funding_cny',
    'funding_currency', 'funding_eur', 'funding_gbp', 'funding_jpy', 'funding_nzd', 'funding_schemes', 'funding_usd',
    'investigators', 'language', 'language_title', 'original_title', 'project_numbers',
    'research_org_cities', 'research_org_countries', 'research_org_names', 'research_org_state_codes', 'researchers',
    'score', 'start_date', 'start_year', 'funder_org_acronym', 'keywords', 'research_orgs', 'category_rcdc',
    'category_hrcs_hc', 'category_hrcs_rac', 'category_hra', 'category_bra', 'category_icrp_cso', 'category_icrp_ct'
]
# Get the text from these fields and merge them
TEXT_FIELDS = ['abstract']

# Import ID: leave None to create a new import, otherwise set to the ID of an existing one
IMPORT_ID = None

if not SOURCE_FILE.exists():
    raise FileNotFoundError(f'File doesn\'t exist: {SOURCE_FILE}')

logger.info('Going to include the following fields:')
for key, vkey in FIELDS_VISIBLE.items():
    logger.info(f'  - {key} -> {vkey}')
logger.info('Also including these keys, but they won\'t be visible in the interface:')
for key in FIELDS_HIDDEN:
    logger.info(f'  - {key} -> _{key}')
logger.info(f'Going to read the text from {TEXT_FIELDS}')

with db_engine.session() as session:
    if IMPORT_ID is None:
        IMPORT_ID = str(uuid.uuid4())
        logger.info(f'Creating new import with id={IMPORT_ID}')
        import_orm: Import = Import(
            project_id=PROJECT_ID,
            user_id=USER_ID,
            import_id=IMPORT_ID,
            name=IMPORT_NAME,
            description=DESCRIPTION,
            type='script',
            time_created=datetime.datetime.now(),
            time_started=datetime.datetime.now(),
            time_finished=datetime.datetime.now()
        )
        session.add(import_orm)
        session.commit()

    logger.info(f'Reading data from {SOURCE_FILE}')
    with open(SOURCE_FILE, 'r') as f:
        for li, line in enumerate(f):
            src = json.loads(line)
            item_id = str(uuid.uuid4())
            logger.debug(f'{li} | Creating item with item_id={item_id}')
            logger.debug(src)
            text = [src.get(key, f'[MISSING {key}]') for key in TEXT_FIELDS]
            item = GenericItem(
                item_id=item_id,
                project_id=PROJECT_ID,
                text='\n\n'.join([t for t in text if t is not None]),
                type='generic',
                meta={
                    **{
                        vkey: src.get(key)
                        for key, vkey in FIELDS_VISIBLE.items()
                    },
                    **{
                        f'_{key}': src.get(key)
                        for key in FIELDS_HIDDEN
                    }
                }
            )
            session.add(item)

            # Link the item to the import via the m2m table and commit both together
            stmt_m2m = (insert(m2m_import_item_table)
                        .values(item_id=item_id, import_id=IMPORT_ID, type=M2MImportItemType.explicit))
            session.execute(stmt_m2m)
            session.commit()
logger.info('All done!')
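
For orientation, each line of the source file is expected to be a single JSON object whose keys match the field names configured above. What json.loads(line) returns for one such (made-up) line might look roughly like this:

src = {
    'id': 'grant.1234567',
    'title': 'Example grant title',
    'abstract': 'The project abstract that ends up as the item text ...',
    'funding_usd': 100000,
    'query_combined': 'direct air capture',
    'start_year': 2021,
    'linkout': 'https://example.org/grants/1234567',
    # ... plus any of the hidden fields listed in FIELDS_HIDDEN, e.g.:
    'start_date': '2021-01-01',
    'funder_org_name': 'Example Funder'
}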

The example imports research grant abstracts from Dimensions, which will look like this on the platform:

Example