Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions cassis/cas.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
TypeCheckError,
TypeSystem,
TypeSystemMode,
load_typesystem,
)

_validator_optional_string = validators.optional(validators.instance_of(str))
Expand Down Expand Up @@ -832,6 +833,119 @@ def _copy(self) -> "Cas":
result._xmi_id_generator = self._xmi_id_generator
return result

def deep_copy(self, copy_typesystem: bool = False) -> "Cas":
"""
Create and return a deep copy of this CAS object.
All feature structures, views, and sofas are copied. If `copy_typesystem` is True, the typesystem is also deep-copied;
otherwise, the original typesystem is shared between the original and the copy.
Args:
copy_typesystem (bool): Whether to copy the original typesystem or not. If True, the typesystem is deep-copied.
Returns:
Cas: A deep copy of this CAS object.
"""
ts = self.typesystem
if copy_typesystem:
ts = self.typesystem.to_xml()
ts = load_typesystem(ts)

cas_copy = Cas(ts,
document_language=self.document_language,
lenient=self._lenient,
sofa_mime=self.sofa_mime,
)

Comment on lines +851 to +856
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On line 852, document_language is read from self.document_language, which accesses the document annotation. If the original CAS doesn't have a document annotation, this will create one, which might not be desired behavior during copying. Consider handling the case where document_language is None or catching exceptions when the document annotation doesn't exist.

Suggested change
cas_copy = Cas(ts,
document_language=self.document_language,
lenient=self._lenient,
sofa_mime=self.sofa_mime,
)
# Safely determine the document language without assuming that a
# document annotation already exists on the original CAS.
document_language = None
try:
document_language = self.document_language
except AttributeError:
document_language = None
cas_copy = Cas(
ts,
document_language=document_language,
lenient=self._lenient,
sofa_mime=self.sofa_mime,
)

Copilot uses AI. Check for mistakes.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see a problem here, given that document_language defaults to None when a CAS is initialized without an explicit value.

cas_copy._views = {}
cas_copy._sofas = {}

for sofa in self.sofas:

sofa_copy = Sofa(
sofaID=sofa.sofaID,
sofaNum=sofa.sofaNum,
type=ts.get_type(sofa.type.name),
xmiID=sofa.xmiID,
)
sofa_copy.mimeType = sofa.mimeType
sofa_copy.sofaArray = sofa.sofaArray
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On line 869, sofaArray is copied by direct assignment, which creates a shallow copy. If sofaArray contains mutable data (like a list or array), modifications to one could affect the other. Consider using copy.copy() or copy.deepcopy() depending on the data structure.

Copilot uses AI. Check for mistakes.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a non-issue, because sofaArray is a string.

sofa_copy.sofaString = sofa.sofaString
sofa_copy.sofaURI = sofa.sofaURI

cas_copy._sofas[sofa_copy.sofaID] = sofa_copy
cas_copy._views[sofa_copy.sofaID] = View(sofa=sofa_copy)

# removes the _IntialView created with the initialization of the copied CAS
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in comment: "_IntialView" should be "_InitialView"

Suggested change
# removes the _IntialView created with the initialization of the copied CAS
# removes the _InitialView created with the initialization of the copied CAS

Copilot uses AI. Check for mistakes.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes!

cas_copy._current_view = cas_copy._views["_InitialView"]

references = dict()
referenced_arrays = dict()

all_copied_fs = dict()
referenced_view = {}

for fs in self._find_all_fs():

# the referenced view is required when adding the fs to the copied cas later
if hasattr(fs, 'sofa') and fs.sofa and hasattr(fs, 'xmiID') and fs.xmiID:
referenced_view[fs.xmiID] = fs.sofa.sofaID

t = ts.get_type(fs.type.name)
fs_copy = t()

for feature in t.all_features:
if ts.is_primitive(feature.rangeType):
fs_copy[feature.name] = fs.get(feature.name)
elif ts.is_primitive_collection(feature.rangeType):
fs_copy[feature.name] = ts.get_type(feature.rangeType.name)()
fs_copy[feature.name].elements = fs.get(feature.name).elements
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When copying primitive collections (line 899), the code directly assigns elements from the original collection to the copy. This creates a shallow copy of the elements list. If the elements are mutable (e.g., lists of bytes), modifications to one could affect the other. Consider creating a deep copy of the elements: fs_copy[feature.name].elements = list(fs.get(feature.name).elements) or using copy.deepcopy.

Suggested change
fs_copy[feature.name].elements = fs.get(feature.name).elements
fs_copy[feature.name].elements = list(fs.get(feature.name).elements)

Copilot uses AI. Check for mistakes.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes!

elif ts.is_array(feature.rangeType):
fs_copy[feature.name] = ts.get_type(TYPE_NAME_FS_ARRAY)()
# collect referenced xmiIDs for mapping later
referenced_list = []
for item in fs[feature.name].elements:
if hasattr(item, 'xmiID') and item.xmiID is not None:
referenced_list.append(item.xmiID)
referenced_arrays.setdefault(fs.xmiID, {})
referenced_arrays[fs.xmiID][feature.name] = referenced_list
Comment on lines +900 to +908
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When copying arrays (line 904), the code doesn't handle None elements properly. If an element in the array is None, it will be skipped when building the referenced_list, but the copied array won't include None at that position. This could result in arrays with different lengths or element orders than the original. Consider preserving None elements or storing their positions.

Copilot uses AI. Check for mistakes.
Comment on lines +898 to +908
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code doesn't handle the case where a primitive collection feature value (line 897-899) or an array feature value (line 900-908) could be None. If fs.get(feature.name) returns None for these feature types, the code will fail with an AttributeError when trying to access .elements. Add a check for None before accessing .elements.

Suggested change
fs_copy[feature.name] = ts.get_type(feature.rangeType.name)()
fs_copy[feature.name].elements = fs.get(feature.name).elements
elif ts.is_array(feature.rangeType):
fs_copy[feature.name] = ts.get_type(TYPE_NAME_FS_ARRAY)()
# collect referenced xmiIDs for mapping later
referenced_list = []
for item in fs[feature.name].elements:
if hasattr(item, 'xmiID') and item.xmiID is not None:
referenced_list.append(item.xmiID)
referenced_arrays.setdefault(fs.xmiID, {})
referenced_arrays[fs.xmiID][feature.name] = referenced_list
original_value = fs.get(feature.name)
if original_value is not None:
fs_copy[feature.name] = ts.get_type(feature.rangeType.name)()
fs_copy[feature.name].elements = original_value.elements
else:
fs_copy[feature.name] = None
elif ts.is_array(feature.rangeType):
original_value = fs.get(feature.name)
if original_value is not None:
fs_copy[feature.name] = ts.get_type(TYPE_NAME_FS_ARRAY)()
# collect referenced xmiIDs for mapping later
referenced_list = []
for item in original_value.elements:
if hasattr(item, 'xmiID') and item.xmiID is not None:
referenced_list.append(item.xmiID)
referenced_arrays.setdefault(fs.xmiID, {})
referenced_arrays[fs.xmiID][feature.name] = referenced_list
else:
fs_copy[feature.name] = None

Copilot uses AI. Check for mistakes.
elif feature.rangeType.name == TYPE_NAME_SOFA:
# ignore sofa references
pass
else:
if hasattr(fs[feature.name], 'xmiID') and fs[feature.name].xmiID is not None:
references.setdefault(feature.name, [])
references[feature.name].append((fs.xmiID, fs[feature.name].xmiID))
Comment on lines +913 to +915
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code checks if fs[feature.name] has an xmiID attribute (line 913), but doesn't handle the case where fs[feature.name] could be None. This will cause an error. Add a None check before accessing attributes: if fs[feature.name] is not None and hasattr(...)

Suggested change
if hasattr(fs[feature.name], 'xmiID') and fs[feature.name].xmiID is not None:
references.setdefault(feature.name, [])
references[feature.name].append((fs.xmiID, fs[feature.name].xmiID))
feature_value = fs[feature.name]
if feature_value is not None and hasattr(feature_value, 'xmiID') and feature_value.xmiID is not None:
references.setdefault(feature.name, [])
references[feature.name].append((fs.xmiID, feature_value.xmiID))

Copilot uses AI. Check for mistakes.
else:
warnings.warn(f"Original non-primitive feature \"{feature.name}\" was and not copied from feature structure {fs.xmiID}.")
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Grammatical error in warning message: "was and not copied" should be "was not copied"

Suggested change
warnings.warn(f"Original non-primitive feature \"{feature.name}\" was and not copied from feature structure {fs.xmiID}.")
warnings.warn(f"Original non-primitive feature \"{feature.name}\" was not copied from feature structure {fs.xmiID}.")

Copilot uses AI. Check for mistakes.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes!


Comment on lines +894 to +918
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The deep_copy method doesn't handle FSList (linked list) types. While it handles primitive collections and FSArrays, FSList types (like NonEmptyFSList) are not explicitly handled in the feature copying logic (lines 894-918). These will fall into the else clause on line 912, where they'll be treated as single references, which is incorrect for list structures. Consider adding explicit handling for FSList types similar to how FSArray is handled.

Copilot uses AI. Check for mistakes.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@reckart: I haven't seen a CAS containing an FSList yet. Do you know where I could find one?

fs_copy.xmiID = fs.xmiID
all_copied_fs[fs_copy.xmiID] = fs_copy

# set references to single objects
for feature, pairs in references.items():
for current_ID, reference_ID in pairs:
try:
all_copied_fs[current_ID][feature] = all_copied_fs[reference_ID]
except KeyError as e:
warnings.warn(f"Reference {reference_ID} not found for feature '{feature}' of feature structure {current_ID}")

# set references for objects in arrays
for current_ID, arrays in referenced_arrays.items():
for feature, referenced_list in arrays.items():
elements = [all_copied_fs[reference_ID] for reference_ID in referenced_list]
all_copied_fs[current_ID][feature].elements = elements
Comment on lines +933 to +934
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On line 933, the code assumes all reference_IDs in referenced_list exist in all_copied_fs, but doesn't handle the case where a reference might be missing. This will raise a KeyError. Consider adding error handling similar to lines 925-928, or filter out missing references.

Suggested change
elements = [all_copied_fs[reference_ID] for reference_ID in referenced_list]
all_copied_fs[current_ID][feature].elements = elements
elements = []
for reference_ID in referenced_list:
try:
elements.append(all_copied_fs[reference_ID])
except KeyError:
warnings.warn(
f"Reference {reference_ID} not found for feature '{feature}' of feature structure {current_ID}"
)
try:
all_copied_fs[current_ID][feature].elements = elements
except KeyError:
warnings.warn(
f"Feature '{feature}' or feature structure {current_ID} not found when setting array references"
)

Copilot uses AI. Check for mistakes.

# add feature structures to the appropriate views
feature_structures = sorted(all_copied_fs.values(), key=lambda f: f.xmiID, reverse=False)
for item in all_copied_fs.values():
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable 'feature_structures' is created on line 937 but never used. It appears this was meant to iterate over the sorted list, but line 938 iterates over all_copied_fs.values() instead, which is unsorted. This could lead to issues if feature structures need to be added in a specific order.

Suggested change
for item in all_copied_fs.values():
for item in feature_structures:

Copilot uses AI. Check for mistakes.
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but I would remove the unused variable instead.

if hasattr(item, 'xmiID') and item.xmiID is not None:
view_name = referenced_view.get(item.xmiID)
if view_name is not None:
cas_copy._current_view = cas_copy._views[view_name]
cas_copy.add(item, keep_id=True)

cas_copy._xmi_id_generator = IdGenerator(initial_id=self._xmi_id_generator._next_id)
cas_copy._sofa_num_generator = IdGenerator(initial_id=self._sofa_num_generator._next_id)
return cas_copy


def _sort_func(a: FeatureStructure) -> Tuple[int, int, int]:
d = a.__slots__
Expand Down
62 changes: 62 additions & 0 deletions tests/test_cas.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
AnnotationHasNoSofa,
)
from tests.fixtures import *
from tests.test_files.test_cas_generators import MultiFeatureRandomCasGenerator, MultiTypeRandomCasGenerator

# Cas

Expand Down Expand Up @@ -540,3 +541,64 @@ def test_covered_text_on_annotation_without_sofa():

with pytest.raises(AnnotationHasNoSofa):
ann.get_covered_text()


def test_deep_copy_without_typesystem(small_xmi, small_typesystem_xml):
    """Check ``deep_copy(copy_typesystem=False)`` against the small XMI fixture.

    The copy must not compare equal to the original CAS, must serialize to
    the same JSON, and must carry a type system that compares equal to the
    original's.
    """
    typesystem = load_typesystem(small_typesystem_xml)
    original = load_cas_from_xmi(small_xmi, typesystem=typesystem)

    duplicate = original.deep_copy(copy_typesystem=False)

    assert original != duplicate

    # Compare lengths first so a size mismatch fails with a short message
    # before the full-text comparison runs.
    duplicate_json = duplicate.to_json(pretty_print=True)
    original_json = original.to_json(pretty_print=True)
    assert len(duplicate_json) == len(original_json)
    assert duplicate_json == original_json

    assert original.typesystem == duplicate.typesystem


def test_deep_copy_with_typesystem(small_xmi, small_typesystem_xml):
    """Check ``deep_copy(copy_typesystem=True)`` against the small XMI fixture.

    The copy must not compare equal to the original CAS and must serialize
    to the same JSON. Its type system must not compare equal to the
    original's, yet both type systems must serialize to the same XML.
    """
    typesystem = load_typesystem(small_typesystem_xml)
    original = load_cas_from_xmi(small_xmi, typesystem=typesystem)

    duplicate = original.deep_copy(copy_typesystem=True)

    assert original != duplicate

    # Compare lengths first so a size mismatch fails with a short message
    # before the full-text comparison runs.
    duplicate_json = duplicate.to_json(pretty_print=True)
    original_json = original.to_json(pretty_print=True)
    assert len(duplicate_json) == len(original_json)
    assert duplicate_json == original_json

    # The type systems are expected to differ as objects but agree in content.
    assert original.typesystem != duplicate.typesystem
    original_ts_xml = original.typesystem.to_xml()
    duplicate_ts_xml = duplicate.typesystem.to_xml()
    assert len(original_ts_xml) == len(duplicate_ts_xml)
    assert original_ts_xml == duplicate_ts_xml


def test_random_multi_type_random_deep_copy():
    """Deep-copy randomly generated multi-type CASes and compare their XMI.

    Runs ten rounds; in round ``k`` (1..10) the generator is configured with
    ``size = 10 * k`` and ``type_count = k``.
    """
    generator = MultiTypeRandomCasGenerator()

    for step in range(1, 11):
        generator.size = step * 10
        generator.type_count = step

        org = generator.generate_cas(generator.generate_type_system())
        print(f"CAS size: {sum(len(view.get_all_annotations()) for view in org.views)}")

        duplicate = org.deep_copy(copy_typesystem=True)

        expected_xmi = org.to_xmi(pretty_print=True)
        actual_xmi = duplicate.to_xmi(pretty_print=True)

        assert org != duplicate
        # Length check first: a size mismatch fails with a short message.
        assert len(expected_xmi) == len(actual_xmi)
        assert expected_xmi == actual_xmi


def test_random_multi_feature_deep_copy():
    """Deep-copy randomly generated multi-feature CASes and compare their XMI.

    Runs ten rounds; in round ``k`` (1..10) the generator is configured with
    ``size = 10 * k``.
    """
    generator = MultiFeatureRandomCasGenerator()

    for step in range(1, 11):
        generator.size = step * 10

        org = generator.generate_cas(generator.generate_type_system())
        print(f"CAS size: {sum(len(view.get_all_annotations()) for view in org.views)}")

        duplicate = org.deep_copy(copy_typesystem=True)

        expected_xmi = org.to_xmi(pretty_print=True)
        actual_xmi = duplicate.to_xmi(pretty_print=True)

        assert org != duplicate
        # Length check first: a size mismatch fails with a short message.
        assert len(expected_xmi) == len(actual_xmi)
        assert expected_xmi == actual_xmi

Comment on lines +546 to +604
Copy link

Copilot AI Jan 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tests don't cover edge cases such as: feature structures with None values in arrays, feature structures with None non-primitive features, empty arrays, or multiple views. Consider adding tests for these scenarios to ensure the deep_copy method handles them correctly.

Copilot uses AI. Check for mistakes.
Loading