Skip to content

Commit

Permalink
Merge pull request #135 from nexB/api-update
Browse files Browse the repository at this point in the history
Api update
  • Loading branch information
JonoYang authored Jun 29, 2023
2 parents 2b509c7 + 798dd87 commit ea522de
Show file tree
Hide file tree
Showing 22 changed files with 264 additions and 124 deletions.
10 changes: 10 additions & 0 deletions matchcode-toolkit/CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
Changelog
=========

v1.1.1
------

*2023-06-29* -- Do not include empty files when computing directory fingerprints.

v1.1.0
------

*2023-06-22* -- Rename ``compute_directory_fingerprints`` to ``compute_codebase_directory_fingerprints`` and create a new version of ``compute_directory_fingerprints`` that works on Resource objects instead of codebases.

v1.0.0
------

Expand Down
2 changes: 1 addition & 1 deletion matchcode-toolkit/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "matchcode-toolkit"
version = "1.0.0"
version = "1.1.1"

[build-system]
requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 6"]
Expand Down
2 changes: 1 addition & 1 deletion matchcode-toolkit/setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = matchcode-toolkit
version = 1.0.0
version = 1.1.1
license = Apache-2.0

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
58 changes: 39 additions & 19 deletions matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,30 +69,50 @@ def create_structure_fingerprint(directory, children):
return _create_directory_fingerprint(features)


def compute_directory_fingerprints(codebase):
def _compute_directory_fingerprints(directory, codebase):
"""
Compute fingerprints for a directory from `codebase`
Compute fingerprints for `directory` from `codebase`
"""
for resource in codebase.walk(topdown=False):
if resource.is_file or not resource.path:
continue
children = [r for r in resource.walk(codebase) if r.is_file]
if len(children) == 1:
continue
# We do not want to add empty files to our fingerprint
children = [r for r in directory.walk(codebase) if r.is_file and r.size]
if len(children) == 1:
return

directory_content_fingerprint = create_content_fingerprint(children)
if hasattr(resource, 'directory_content_fingerprint'):
resource.directory_content_fingerprint = directory_content_fingerprint
else:
resource.extra_data['directory_content'] = directory_content_fingerprint
directory_content_fingerprint = create_content_fingerprint(children)
if hasattr(directory, 'directory_content_fingerprint'):
directory.directory_content_fingerprint = directory_content_fingerprint
else:
directory.extra_data['directory_content'] = directory_content_fingerprint

directory_structure_fingerprint = create_structure_fingerprint(resource, children)
if hasattr(resource, 'directory_structure_fingerprint'):
resource.directory_structure_fingerprint = directory_structure_fingerprint
else:
resource.extra_data['directory_structure'] = create_structure_fingerprint(resource, children)
directory_structure_fingerprint = create_structure_fingerprint(directory, children)
if hasattr(directory, 'directory_structure_fingerprint'):
directory.directory_structure_fingerprint = directory_structure_fingerprint
else:
directory.extra_data['directory_structure'] = directory_structure_fingerprint

directory.save(codebase)
return directory


def compute_directory_fingerprints(directory, codebase):
    """
    Recursively compute fingerprints for `directory` from `codebase`.

    Every Resource yielded by `directory.walk(codebase, topdown=False)` that
    is not a file is passed to `_compute_directory_fingerprints`.
    Return `directory`.
    """
    # Keep this lazy so that walking and fingerprinting stay interleaved,
    # one Resource at a time.
    non_file_resources = (
        descendant
        for descendant in directory.walk(codebase, topdown=False)
        if not descendant.is_file
    )
    for descendant in non_file_resources:
        _compute_directory_fingerprints(descendant, codebase)
    return directory

resource.save(codebase)

def compute_codebase_directory_fingerprints(codebase):
    """
    Compute fingerprints for the directories of `codebase`.

    Walk `codebase` with `topdown=False` and pass every Resource that is not
    a file and has a non-empty `path` to `_compute_directory_fingerprints`.
    Return `codebase`.
    """
    for resource in codebase.walk(topdown=False):
        # Files and Resources without a path are not fingerprinted.
        if not resource.is_file and resource.path:
            _compute_directory_fingerprints(resource, codebase)
    return codebase


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints

from scanpipe.pipelines.scan_package import ScanPackage
from scanpipe.pipes.codebase import ProjectCodebase
Expand Down Expand Up @@ -63,4 +63,4 @@ def fingerprint_codebase(self):
Compute directory fingerprints for matching purposes
"""
project_codebase = ProjectCodebase(self.project)
compute_directory_fingerprints(project_codebase)
compute_codebase_directory_fingerprints(project_codebase)
4 changes: 2 additions & 2 deletions matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import POST_SCAN_GROUP
from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from plugincode.post_scan import post_scan_impl
from plugincode.post_scan import PostScanPlugin

Expand Down Expand Up @@ -41,4 +41,4 @@ def is_enabled(self, fingerprint, **kwargs):
return fingerprint

def process_codebase(self, codebase, **kwargs):
codebase = compute_directory_fingerprints(codebase)
codebase = compute_codebase_directory_fingerprints(codebase)
6 changes: 3 additions & 3 deletions matchcode-toolkit/tests/test_fingerprinting.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from matchcode_toolkit.fingerprinting import _create_directory_fingerprint
from matchcode_toolkit.fingerprinting import _get_resource_subpath
from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import create_content_fingerprint
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import create_structure_fingerprint
Expand Down Expand Up @@ -95,10 +95,10 @@ def test_create_halohash_chunks(self):
self.assertEqual(chunk3, expected_chunk3)
self.assertEqual(chunk4, expected_chunk4)

def test_compute_directory_fingerprints(self):
def test_compute_codebase_directory_fingerprints(self):
scan_loc = self.get_test_loc('abbrev-1.0.3-i.json')
vc = VirtualCodebase(location=scan_loc)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
directory_content = vc.root.extra_data['directory_content']
directory_structure = vc.root.extra_data['directory_structure']
expected_directory_content = '0000000346ce04751a3c98f00086f16a91d9790b'
Expand Down
22 changes: 22 additions & 0 deletions matchcode/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,28 @@ class MultipleCharFilter(MultipleChoiceFilter):
field_class = MultipleCharField


# TODO: Think of a better name for this filter
class MultipleCharInFilter(MultipleCharFilter):
    """
    A `MultipleCharFilter` that applies all submitted values in one
    `<name>__in` lookup.
    """

    def filter(self, qs, value):
        """
        Return `qs` filtered by an `__in` lookup built from `value`.

        `qs` is returned unchanged when `value` is empty or when
        `self.is_noop(qs, value)` is true.
        """
        # An empty value is not strictly a noop, but there is no point
        # filtering on it.
        if not value or self.is_noop(qs, value):
            return qs

        lookups = self.get_filter_predicate(value)
        # Only the first key of the predicate is renamed.
        # NOTE(review): presumably the predicate has a single entry -- confirm.
        lookup_name = next(iter(lookups))
        lookups[f'{lookup_name}__in'] = lookups.pop(lookup_name)

        apply_lookup = self.get_method(qs)
        filtered = apply_lookup(Q(**lookups))

        if self.distinct:
            return filtered.distinct()
        return filtered


class MultipleSHA1Filter(MultipleCharFilter):
"""
Overrides `MultipleCharFilter.filter()` to convert the SHA1
Expand Down
4 changes: 2 additions & 2 deletions matchcode/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from commoncode.resource import VirtualCodebase

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode.models import ApproximateDirectoryContentIndex
from matchcode.models import ApproximateDirectoryStructureIndex
from matchcode.models import ExactPackageArchiveIndex
Expand Down Expand Up @@ -150,5 +150,5 @@ def index_package_directories(package):
if not vc:
return 0, 0

vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
return index_directory_fingerprints(vc, package)
4 changes: 2 additions & 2 deletions matchcode/tests/test_index_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from commoncode.resource import VirtualCodebase

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode.indexing import _create_virtual_codebase_from_package_resources
from matchcode.indexing import index_directory_fingerprints
Expand Down Expand Up @@ -155,7 +155,7 @@ def test__create_virtual_codebase_from_package_resources(self):

def test_index_directory_fingerprints(self):
vc = _create_virtual_codebase_from_package_resources(self.test_package1)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)

# Ensure tables are empty prior to indexing
self.assertFalse(ApproximateDirectoryContentIndex.objects.all())
Expand Down
4 changes: 2 additions & 2 deletions matchcode/tests/test_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from commoncode.resource import VirtualCodebase
from packagedb.models import Package

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode.management.commands.index_packages import index_package_directories
from matchcode.match import EXACT_PACKAGE_ARCHIVE_MATCH
from matchcode.match import APPROXIMATE_DIRECTORY_STRUCTURE_MATCH
Expand All @@ -37,7 +37,7 @@ def run_do_match_from_scan(scan_file_location, match_type):
matched_to=attr.ib(default=attr.Factory(list))
)
)
vc = compute_directory_fingerprints(vc)
vc = compute_codebase_directory_fingerprints(vc)
do_match(vc, match_type)
return vc

Expand Down
6 changes: 3 additions & 3 deletions matchcode/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from packagedb.models import Package
import attr

from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode.management.commands.index_packages import index_package_directories
from matchcode.models import ApproximateDirectoryContentIndex
Expand Down Expand Up @@ -169,7 +169,7 @@ def test_ApproximateDirectoryStructureIndex_match_subdir(self):
location=scan_location,
resource_attributes=dict(packages=attr.ib(default=attr.Factory(list)))
)
codebase = compute_directory_fingerprints(vc)
codebase = compute_codebase_directory_fingerprints(vc)

# populate codebase with match results
for resource in codebase.walk(topdown=True):
Expand All @@ -192,7 +192,7 @@ def test_ApproximateDirectoryContentIndex_match_subdir(self):
location=scan_location,
resource_attributes=dict(packages=attr.ib(default=attr.Factory(list)))
)
codebase = compute_directory_fingerprints(vc)
codebase = compute_codebase_directory_fingerprints(vc)

# populate codebase with match results
for resource in codebase.walk(topdown=True):
Expand Down
109 changes: 60 additions & 49 deletions minecode/management/commands/process_scans.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,58 +61,69 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa
if scannable_uri.scan_status in (ScannableURI.SCAN_SUBMITTED, ScannableURI.SCAN_IN_PROGRESS):
scannable_uri.scan_status = get_scan_status(scan_info)
elif scannable_uri.scan_status in (ScannableURI.SCAN_COMPLETED,):
logger.info('Indexing scanned files for URI: {}'.format(scannable_uri))

package = scannable_uri.package
scan_data = scanning.get_scan_data(
scannable_uri.scan_uuid,
api_url=cls.api_url,
api_auth_headers=cls.api_auth_headers,
get_scan_data_save_loc=get_scan_data_save_loc
)
scan_index_errors = index_package_files(package, scan_data)
scan_index_errors = []
try:
logger.info('Indexing scanned files for URI: {}'.format(scannable_uri))

package = scannable_uri.package
input_size = scan_info.size
if input_size:
computed_timeout = ((input_size / 1000000) / 2) * 60
timeout = max(computed_timeout, scanning.REQUEST_TIMEOUT)
else:
timeout = scanning.REQUEST_TIMEOUT
scan_data = scanning.get_scan_data(
scannable_uri.scan_uuid,
api_url=cls.api_url,
api_auth_headers=cls.api_auth_headers,
timeout=timeout,
get_scan_data_save_loc=get_scan_data_save_loc
)
scan_index_errors.extend(index_package_files(package, scan_data))

summary = scanning.get_scan_summary(
scannable_uri.scan_uuid,
api_url=cls.api_url,
api_auth_headers=cls.api_auth_headers,
get_scan_data_save_loc=get_scan_data_save_loc
)
summary = scanning.get_scan_summary(
scannable_uri.scan_uuid,
api_url=cls.api_url,
api_auth_headers=cls.api_auth_headers,
get_scan_data_save_loc=get_scan_data_save_loc
)

other_license_expressions = summary.get('other_license_expressions', [])
other_license_expressions = [l['value'] for l in other_license_expressions if l['value']]
other_license_expression = combine_expressions(other_license_expressions)

copyright = ''
declared_holder = summary.get('declared_holder')
if declared_holder:
copyright = f'Copyright (c) {declared_holder}'

values_by_updateable_fields = {
'sha1': scan_info.sha1,
'sha256': scan_info.sha256,
'sha512': scan_info.sha512,
'summary': summary,
'declared_license_expression': summary.get('declared_license_expression'),
'other_license_expression': other_license_expression,
'copyright': copyright,
}

for field, value in values_by_updateable_fields.items():
p_val = getattr(package, field)
if not p_val and value:
setattr(package, field, value)
package_updated = True

if package_updated:
package.save()

other_license_expressions = summary.get('other_license_expressions', [])
other_license_expressions = [l['value'] for l in other_license_expressions]
other_license_expression = combine_expressions(other_license_expressions)

copyright = ''
declared_holder = summary.get('declared_holder')
if declared_holder:
copyright = f'Copyright (c) {declared_holder}'

values_by_updateable_fields = {
'sha1': scan_info.sha1,
'sha256': scan_info.sha256,
'sha512': scan_info.sha512,
'summary': summary,
'declared_license_expression': summary.get('declared_license_expression'),
'other_license_expression': other_license_expression,
'copyright': copyright,
}

for field, value in values_by_updateable_fields.items():
p_val = getattr(package, field)
if not p_val and value:
setattr(package, field, value)
package_updated = True

if package_updated:
package.save()

# TODO: We should rerun the specific indexers that have failed
if scan_index_errors:
scannable_uri.index_error = '\n'.join(scan_index_errors)
scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED
else:
scannable_uri.scan_status = ScannableURI.SCAN_INDEXED
except Exception as e:
error_message = str(e) + '\n'
# TODO: We should rerun the specific indexers that have failed
if scan_index_errors:
error_message += '\n'.join(scan_index_errors)
scannable_uri.index_error
scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED

scannable_uri.wip_date = None
scannable_uri.save()
Expand Down
Loading

0 comments on commit ea522de

Please sign in to comment.