Skip to content

Retrospective deduplication⚓︎

Suppose you have a project where you notice that the original deduplication wasn't so great, and you now have a better approach. Whenever a duplicate item is merged into another one and deleted, all references to the deleted item need to be updated to point to the item that is kept. This is not trivial and not a feature we support. However, you might use the following script for inspiration; it did work once...

import uuid
import logging
from typing import Any, TYPE_CHECKING
from sqlalchemy import update, delete, select

from ..db import DatabaseEngineAsync
from ..db.schemas import \
    Assignment, \
    Annotation, \
    BotAnnotationMetaData, \
    BotAnnotation, \
    m2m_import_item_table
from ..models.bot_annotations import BotKind

if TYPE_CHECKING:
    # Only needed by type checkers (referenced from a `# type:` comment further down),
    # hence the guarded import and the suppressed "unused import" lint.
    from sqlalchemy.ext.asyncio import AsyncSession  # noqa: F401

# Module-level logger registered under a fixed, explicit name.
logger = logging.getLogger('nacsos_data.util.deduplicate')


async def update_references(old_item_id: str | uuid.UUID,
                            new_item_id: str | uuid.UUID,
                            db_engine: DatabaseEngineAsync) -> None:
    """
    Re-point all database references from `old_item_id` to `new_item_id` (currently disabled).

    The typical use-case is retrospective deduplication: after a pair of duplicates was
    detected and merged into a single item, everything that still refers to the dropped
    item (assignments, annotations, bot annotations, import relations, and the item ids
    kept in the meta-data of label resolutions) has to refer to the surviving item instead.

    Note: In its current state, the function raises right away (see `:raises`), so none of
          the statements following the `raise` are executed.
    Note: The AcademicItem (or TwitterItem,...) with `new_item_id` has to exist in the database.
    Note: This function does not delete the item for `old_item_id`

    :param old_item_id: id of the item that references currently point to
    :param new_item_id: id of the item that references should point to afterwards
    :param db_engine: engine that provides the async session all statements run in
    :return: nothing
    :raises DeprecationWarning: always, as the handling of `BotAnnotationMetaData` for
                                resolutions is flagged as outdated
    """
    raise DeprecationWarning('The metadata in BotAnnotationMetaData changed for resolutions; Function needs updating!')
    # Tables that need no update:
    #  - annotation_scheme
    #  - assignment_scope
    #  - auth_tokens
    #  - highlighters
    #  - import
    #  - project
    #  - project_permissions
    #  - user
    #
    # Tables that are deliberately left untouched:
    #  - academic_item
    #  - twitter_item
    #  - generic_item
    #  - item
    #
    # `tasks` might need updates as well, but that is too painful for little gain.
    # It may have to be done at some point, but is not important for now.

    async with db_engine.session() as session:  # type: AsyncSession # type: ignore[unreachable]
        # The resolution meta-data is keyed by stringified item ids.
        old_key = str(old_item_id)
        new_key = str(new_item_id)

        # Annotations: old item -> new item
        stmt_annotations = (
            update(Annotation)
            .where(Annotation.item_id == old_item_id)
            .values(item_id=new_item_id)
            .returning(Annotation.annotation_id)
        )
        n_annotations = await session.execute(stmt_annotations)

        # Assignments: old item -> new item
        stmt_assignments = (
            update(Assignment)
            .where(Assignment.item_id == old_item_id)
            .values(item_id=new_item_id)
        )
        n_assignments = await session.execute(stmt_assignments)

        # BotAnnotations: old item -> new item
        stmt_bot_annotations = (
            update(BotAnnotation)
            .where(BotAnnotation.item_id == old_item_id)
            .values(item_id=new_item_id)
        )
        n_bot_annotations = await session.execute(stmt_bot_annotations)

        # Import many-to-many relation, step 1: remove the rows of the new item so that
        # step 2 can re-create them from the rows of the old item.
        # NOTE(review): this removes *every* import link of the new item, also for imports
        #               the old item is not part of -- confirm that this is intended.
        stmt_m2m_delete = (
            delete(m2m_import_item_table)
            .where(m2m_import_item_table.c.item_id == new_item_id)
        )
        n_m2m_del = await session.execute(stmt_m2m_delete)

        # Import many-to-many relation, step 2: hand the rows of the old item to the new item.
        stmt_m2m_update = (
            update(m2m_import_item_table)
            .where(m2m_import_item_table.c.item_id == old_item_id)
            .values(item_id=new_item_id)
        )
        n_m2m2_upd = await session.execute(stmt_m2m_update)

        # Label resolutions keep background information (incl. item ids) in their meta-data.
        stmt_resolve_scopes = (
            select(BotAnnotationMetaData)
            .where(BotAnnotationMetaData.kind == BotKind.RESOLVE)
        )
        resolve_scopes = (await session.execute(stmt_resolve_scopes)).scalars().all()

        n_ba_scopes = 0
        for scope in resolve_scopes:
            scope_meta: dict[str, Any] = scope.meta  # type: ignore[assignment] # dict of type BotMetaResolve

            # Nothing to do for resolutions that never saw the old item.
            if scope_meta is None or old_key not in scope_meta['collection']['annotations']:
                continue

            per_item = scope_meta['collection']['annotations']
            moved = per_item[old_key]

            # Rewrite the `item_id` of all `AnnotationCollection` entries that carry the old id.
            for collections in moved:
                for collection in collections:
                    # collection[0] is the path (recursive keys based on the parent structure),
                    # collection[1] holds the annotations for that path
                    for anno in collection[1]:
                        if str(anno['item_id']) == old_key:
                            anno['item_id'] = new_key

            if new_key in per_item:
                # The new item already has an entry, so the collections are merged.
                # This may lead to "duplicate" labels (same user,item pairs); all are kept on purpose.
                per_item[new_key] += moved
            else:
                # The new item has no entry yet, so the old entry is simply moved over.
                per_item[new_key] = moved
            del per_item[old_key]

            n_ba_scopes += 1
            # NOTE(review): the meta-data was mutated in place and is re-assigned as the very same
            #               object -- confirm that the column type picks this up as a change.
            scope.meta = scope_meta  # type: ignore[assignment]
            # NOTE(review): this is the only explicit commit in this function; whether the statements
            #               above are persisted when no resolution is touched depends on what
            #               `db_engine.session()` does on exit -- confirm.
            await session.commit()

        logger.debug(f'Updated references "{old_item_id}"->"{new_item_id}": '
                     f'{n_annotations.rowcount} annotations affected, '  # type: ignore[attr-defined]
                     f'{n_assignments.rowcount} assignments affected, '  # type: ignore[attr-defined]
                     f'{n_bot_annotations.rowcount} bot_annotations affected, '  # type: ignore[attr-defined]
                     f'{n_m2m_del.rowcount} import_m2m entries deleted, '  # type: ignore[attr-defined]
                     f'{n_m2m2_upd.rowcount} import_m2m entries updated, '  # type: ignore[attr-defined]
                     f'{n_ba_scopes} bot_annotation_metadata for RESOLVE updated')