Project

Profile

Help

Issue #9107 » repair.py

dalley, 08/03/2021 01:18 AM

 
import asyncio
import logging
import shutil
import sys
import tempfile

import createrepo_c as cr

import django
django.setup()

from django.core.files.storage import default_storage as storage

from pulpcore.plugin.models import (
ContentArtifact,
RepositoryVersion,
)
from pulpcore.tasking.util import get_url
from pulp_rpm.app.models import (
Package,
RpmPublication,
RpmRemote,
RpmRepository,
RpmDistribution,
)

log = logging.getLogger(__name__)
loop = asyncio.get_event_loop()

# A package is treated as broken when both of these metadata lists are empty.
broken_criteria = dict(files=[], changelogs=[])
broken_packages = Package.objects.filter(**broken_criteria)
# Materialize the primary keys up front: the final report looks publications
# up by these ids after the packages themselves have been rewritten.
broken_package_ids = list(broken_packages.values_list("pk", flat=True))

print("""
##########################################
# Repair tool for Pulp RPM issue #9107 #
# https://pulp.plan.io/issues/9107 #
##########################################""")

print("""
Analysis
========
""")

# Overall numbers: broken packages versus every package known to the database.
total_broken_packages = broken_packages.count()
total_packages = Package.objects.count()
print(
    "Total packages with broken metadata: {}/{}".format(
        total_broken_packages, total_packages
    )
)

# The script expects exactly one ContentArtifact row per broken package.
broken_content_artifacts = ContentArtifact.objects.filter(content__in=broken_packages)
assert broken_content_artifacts.count() == total_broken_packages

# Split by whether the RPM file is stored locally (artifact set) or not.
broken_content_artifacts_with_files = broken_content_artifacts.filter(artifact__isnull=False)
broken_content_artifacts_without_files = broken_content_artifacts.filter(artifact__isnull=True)

total_broken_packages_with_artifacts = broken_content_artifacts_with_files.count()
total_broken_packages_download_needed = (
    total_broken_packages - total_broken_packages_with_artifacts
)

print(
    "Total packages with broken metadata, with local RPMs available: {}/{}".format(
        total_broken_packages_with_artifacts, total_broken_packages
    )
)
print(
    "Total packages with broken metadata, without local RPMs available: {}/{}".format(
        total_broken_packages_download_needed, total_broken_packages
    )
)

print("""
Repair
======
""")

# Running counters shared by both repair phases and the final report.
total_packages_repaired = 0
total_packages_unable_to_repair = 0

def update_total():
    """Rewrite the one-line progress counter on stdout.

    Reads the module-level ``total_packages_repaired`` and
    ``total_broken_packages`` counters; nothing is returned.
    """
    # The leading "\r" moves the cursor back to column 0 so that every call
    # overwrites the previously printed progress line instead of appending.
    progress = "\rPackage metadata repaired: {}/{}".format(
        total_packages_repaired, total_broken_packages
    )
    sys.stdout.write(progress)
    sys.stdout.flush()

# Repair on-disk Packages
# =======================


def _flush_on_disk_batch(batch):
    """Save one batch of repaired on-disk packages and refresh the progress line."""
    global total_packages_repaired
    Package.objects.bulk_update(batch, fields=["files", "changelogs"])
    total_packages_repaired += len(batch)
    batch.clear()
    update_total()


packages_to_update = []
on_disk_rows = broken_content_artifacts_with_files.select_related("artifact", "content")
for ca in on_disk_rows.iterator():
    # package_from_rpm() is handed a path, so the stored file is first copied
    # into a named temporary file and flushed before it is parsed.
    with storage.open(ca.artifact.file.name) as fp, tempfile.NamedTemporaryFile(
        "wb", suffix="blah.rpm"
    ) as temp_file:
        shutil.copyfileobj(fp, temp_file)
        temp_file.flush()
        cr_pkginfo = cr.package_from_rpm(temp_file.name)

    fresh_metadata = Package.createrepo_to_dict(cr_pkginfo)
    # Not very efficient (one extra lookup per package), but acceptable for a
    # one-time script.
    old_package = ca.content.rpm_package
    old_package.files = fresh_metadata["files"]
    old_package.changelogs = fresh_metadata["changelogs"]
    packages_to_update.append(old_package)

    # Write to the database in batches of 50.
    if len(packages_to_update) >= 50:
        _flush_on_disk_batch(packages_to_update)

# Persist whatever is left over from the final, partially filled batch.
_flush_on_disk_batch(packages_to_update)

# Repair on-demand Packages
# =========================


async def repair_on_demand_content(content, remote_artifacts):
    """Rebuild ``files`` and ``changelogs`` for a package that has no local RPM.

    The remote artifacts are tried in order. The first one that downloads
    successfully is parsed with createrepo_c, and its ``files`` and
    ``changelogs`` are copied onto the existing package. The package is NOT
    saved here; the caller is expected to persist it.

    Args:
        content: Content object whose ``rpm_package`` is the package to repair.
        remote_artifacts: Iterable of remote-artifact objects; each must expose
            ``.remote`` with a ``get_downloader(remote_artifact=...)`` method.

    Returns:
        The updated (unsaved) package, or ``None`` if none of the remote
        artifacts could be downloaded.

    Raises:
        Exception: Anything raised while parsing the downloaded RPM or looking
            up the package propagates to the caller. The downloaded file is
            removed first.
    """
    import aiofiles
    import aiofiles.os
    import createrepo_c as cr
    from pulp_rpm.app.models import Package

    # NOTE(review): the temporary directory is entered but its path is never
    # used in this function -- confirm where the downloader writes its file.
    async with aiofiles.tempfile.TemporaryDirectory():
        for remote_artifact in remote_artifacts:
            remote = remote_artifact.remote
            downloader = remote.get_downloader(remote_artifact=remote_artifact)
            try:
                download_result = await downloader.run()
            except Exception:
                # Deliberate best effort: a failed download just means the
                # next remote artifact (if any) gets a try.
                continue

            try:
                cr_pkginfo = cr.package_from_rpm(download_result.path)
                new_package = Package.createrepo_to_dict(cr_pkginfo)
                # Not very efficient, but for a one-time script it's "fine".
                # NOTE(review): this is a synchronous ORM access inside a
                # coroutine -- confirm the Django version in use allows it.
                old_package = content.rpm_package
            finally:
                # Always delete the downloaded RPM, even when parsing or the
                # lookup fails, so failed repairs do not leave files behind.
                await aiofiles.os.remove(download_result.path)

            old_package.files = new_package["files"]
            old_package.changelogs = new_package["changelogs"]

            return old_package

    # Every download attempt failed (or there were no remote artifacts).
    return None

def _drain_on_demand_batch(pending):
    """Run one batch of repair coroutines, save the successes, update counters."""
    global total_packages_repaired
    global total_packages_unable_to_repair

    # return_exceptions=True keeps one failing coroutine from aborting the batch;
    # failures come back as exception objects in the result list.
    outcomes = loop.run_until_complete(
        asyncio.gather(*pending, return_exceptions=True)
    )
    # A coroutine yields None when no download succeeded and an exception
    # object when it raised; neither can be written back.
    repaired = [
        outcome for outcome in outcomes
        if not (outcome is None or isinstance(outcome, Exception))
    ]

    Package.objects.bulk_update(repaired, fields=["files", "changelogs"])
    total_packages_repaired += len(repaired)
    total_packages_unable_to_repair += len(outcomes) - len(repaired)
    pending.clear()
    update_total()


packages_to_update = []

for ca in broken_content_artifacts_without_files.select_related("content").iterator():
    remote_artifacts = list(ca.remoteartifact_set.all().select_related("remote"))
    # Calling the async function only creates the coroutine; it is executed
    # later, together with the rest of its batch.
    packages_to_update.append(repair_on_demand_content(ca.content, remote_artifacts))
    # Process in batches of 10 coroutines.
    if len(packages_to_update) >= 10:
        _drain_on_demand_batch(packages_to_update)

# Handle the final, partially filled batch.
_drain_on_demand_batch(packages_to_update)

# Final report
# ============
# Closes the progress line, summarizes failures, and lists the complete
# publications whose repository versions contain any package that was
# flagged as broken at the start of the run.

print(" ...FINISHED")
if total_packages_unable_to_repair:
    print(
        "Unable to repair metadata for {} packages due to download errors.\n".format(
            total_packages_unable_to_repair
        )
    )
else:
    print()

print("""NOTE: Be aware that this script may persistently falsely identify some packages
as having broken metadata, when they do not. This is expected and not a
problem. If you were able to successfully execute this script but still
see some constant number of packages reported as broken then these are
likely to be false positives.\n""")

if total_packages_repaired:
    # Repository versions that contain a formerly broken package AND have at
    # least one complete publication. The filter spans the multi-valued
    # "publication" relation, so a version with several matching publications
    # would otherwise come back once per publication and be reported
    # repeatedly; distinct() collapses those duplicates.
    repo_versions = list(
        RepositoryVersion.objects.filter(
            publication__in=RpmPublication.objects.filter(
                complete=True,
                repository_version__in=RepositoryVersion.objects.with_content(
                    Package.objects.filter(pk__in=broken_package_ids)
                ),
            )
        ).distinct()
    )
else:
    # Nothing was rewritten, so no publication needs to be looked at.
    repo_versions = []

if not repo_versions:
    print("No further actions required.")
else:
    # NOTE(review): the message is numbered "2)" but no step "1)" is printed
    # anywhere in the visible script -- confirm the intended numbering.
    print(
        "2) Please save the following information for future reference as it cannot be "
        "reproduced by a re-run of this script."
    )

    print(
        "The following publications (potentially) contain incorrect metadata, they can be\n"
        "deleted with the pulp cli like so \"pulp rpm publication destroy --href $(url fragment)\"\n"
    )

    for version in repo_versions:
        complete_publications = RpmPublication.objects.filter(
            repository_version=version, complete=True
        )
        for publication in complete_publications.iterator():
            print("\t {}".format(get_url(publication)))

            print(
                (
                    "\t\t Created from repository '{name}' version {v}\n"
                    "\t\t\t /pulp/api/v3/repositories/rpm/rpm/{pk}/versions/{v}/\n"
                ).format(
                    name=version.repository.name,
                    pk=str(version.repository.pk),
                    v=version.number,
                )
            )

            # Only distributions that point directly at this publication are listed.
            distributions = list(RpmDistribution.objects.filter(publication=publication))
            if distributions:
                print("\t\t Currently being (directly) distributed by:")
                for distribution in distributions:
                    print("\t\t\t {}".format(get_url(distribution)))
    (1-1/1)