Skip to content

Commit 86d26b5

Browse files
committed
Add parent state handling to CartesianProductStreamSlicer
1 parent 47a6f21 commit 86d26b5

File tree

4 files changed

+133
-8
lines changed

4 files changed

+133
-8
lines changed

airbyte-cdk/python/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py

+30
Original file line numberDiff line numberDiff line change
@@ -80,12 +80,42 @@ def stream_slices(self) -> Iterable[StreamSlice]:
8080
yield StreamSlice(partition=partition, cursor_slice=cursor_slice)
8181

8282
def set_initial_state(self, stream_state: StreamState) -> None:
    """
    Restore the per-partition cursors and the parent state from a previously saved stream state.

    For every entry under "states", a cursor is created from the entry's "cursor" value and stored under the key
    derived from the entry's "partition". The full stream state is then handed to the partition router so that
    routers based on parent streams can restore their parent state.

    An empty or missing stream state is a no-op. A state without a "states" entry (for example one that only
    carries "parent_state") creates no partition cursor but is still forwarded to the partition router.

    Args:
        stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
            {
                "states": [
                    {
                        "partition": {
                            "partition_key": "value"
                        },
                        "cursor": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    }
                ],
                "parent_state": {
                    "parent_stream_name": {
                        "last_updated": "2023-05-27T00:00:00Z"
                    }
                }
            }
    """
    if not stream_state:
        return

    # Treat a missing "states" list as empty: indexing it directly would raise KeyError and
    # prevent the parent state below from ever being applied.
    for state in stream_state.get("states", []):
        self._cursor_per_partition[self._to_partition_key(state["partition"])] = self._create_cursor(state["cursor"])

    # Set parent state for partition routers based on parent streams
    self._partition_router.set_parent_state(stream_state)
90120

91121
def observe(self, stream_slice: StreamSlice, record: Record) -> None:

airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py

+25-4
Original file line numberDiff line numberDiff line change
@@ -164,13 +164,25 @@ def stream_slices(self) -> Iterable[StreamSlice]:
164164

165165
yield from stream_slices_for_parent
166166

167-
def set_parent_state(self, stream_state: Optional[StreamState]) -> None:
167+
def set_parent_state(self, stream_state: StreamState) -> None:
168168
"""
169169
Set the state of the parent streams.
170170
171171
Args:
172-
stream_state (Optional[StreamState]): The state of the streams to be set. If `parent_state` exists in the
172+
stream_state (StreamState): The state of the streams to be set. If `parent_state` exists in the
173173
stream_state, it will update the state of each parent stream with the corresponding state from the stream_state.
174+
175+
Example of state format:
176+
{
177+
"parent_state": {
178+
"parent_stream_name1": {
179+
"last_updated": "2023-05-27T00:00:00Z"
180+
},
181+
"parent_stream_name2": {
182+
"last_updated": "2023-05-27T00:00:00Z"
183+
}
184+
}
185+
}
174186
"""
175187
if not stream_state:
176188
return
@@ -183,12 +195,21 @@ def set_parent_state(self, stream_state: Optional[StreamState]) -> None:
183195
if parent_config.incremental_dependency:
184196
parent_config.stream.state = parent_state.get(parent_config.stream.name, {})
185197

186-
def get_parent_state(self) -> Optional[Mapping[str, StreamState]]:
    """
    Get the state of the parent streams.

    Returns:
        Optional[Mapping[str, StreamState]]: The parent state currently held by this router
        (``self._parent_state``), keyed by parent stream name.

    Example of state format:
    {
        "parent_stream_name1": {
            "last_updated": "2023-05-27T00:00:00Z"
        },
        "parent_stream_name2": {
            "last_updated": "2023-05-27T00:00:00Z"
        }
    }
    """
    return self._parent_state

airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py

+50
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,53 @@ def stream_slices(self) -> Iterable[StreamSlice]:
112112
else:
113113
cursor_slice = {}
114114
yield StreamSlice(partition=partition, cursor_slice=cursor_slice)
115+
116+
def set_parent_state(self, stream_state: StreamState) -> None:
    """
    Propagate the parent stream state to every underlying stream slicer.

    The same ``stream_state`` object is passed, unchanged, to each slicer in ``self.stream_slicers``. Slicers that
    are not based on parent streams are expected to ignore it through the default StreamSlicer implementation.

    Args:
        stream_state (StreamState): The state of the streams to be set. When it contains `parent_state`, each
            parent-based slicer uses the matching entry to restore the state of its parent stream.

    Example of state format:
    {
        "parent_state": {
            "parent_stream_name_1": {
                "last_updated": "2023-05-27T00:00:00Z"
            },
            "parent_stream_name_2": {
                "last_updated": "2023-05-27T00:00:00Z"
            }
        }
    }
    """
    for slicer in self.stream_slicers:
        apply_parent_state = slicer.set_parent_state
        apply_parent_state(stream_state)
141+
142+
def get_parent_state(self) -> Optional[Mapping[str, StreamState]]:
    """
    Get the state of the parent streams.

    This method merges the parent states reported by all stream slicers into a single dictionary. Slicers that
    report no parent state (``None`` or an empty mapping) are skipped. If two slicers report the same parent
    stream name, the value from the later slicer wins.

    Returns:
        Optional[Mapping[str, StreamState]]: The combined state of the parent streams in a dictionary format; an
        empty dictionary when no slicer reports a parent state.
        The returned format will be:
        {
            "parent_stream_name1": {
                "last_updated": "2023-05-27T00:00:00Z"
            },
            "parent_stream_name2": {
                "last_updated": "2023-05-27T00:00:00Z"
            }
        }
    """
    combined_state = {}
    for stream_slicer in self.stream_slicers:
        parent_state = stream_slicer.get_parent_state()
        # The base StreamSlicer returns None; dict.update(None) would raise TypeError.
        if parent_state:
            combined_state.update(parent_state)
    return combined_state

airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py

+28-4
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from abc import abstractmethod
66
from dataclasses import dataclass
7-
from typing import Iterable, Optional
7+
from typing import Iterable, Mapping, Optional
88

99
from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider
1010
from airbyte_cdk.sources.types import StreamSlice, StreamState
@@ -33,16 +33,40 @@ def set_parent_state(self, stream_state: StreamState) -> None:
3333
"""
3434
Set the state of the parent streams.
3535
36+
This method should only be defined if the slicer is based on some parent stream and needs to read this stream
37+
incrementally using the state.
38+
3639
Args:
37-
stream_state: The state of the streams to be set. This method can be overridden by subclasses.
40+
stream_state (StreamState): The state of the streams to be set. The expected format is a dictionary that includes
41+
'parent_state' which is a dictionary of parent state names to their corresponding state.
42+
Example:
43+
{
44+
"parent_state": {
45+
"parent_stream_name_1": { ... },
46+
"parent_stream_name_2": { ... },
47+
...
48+
}
49+
}
3850
"""
3951
pass
4052

41-
def get_parent_state(self) -> Optional[Mapping[str, StreamState]]:
    """
    Get the state of the parent streams.

    The default implementation reports no parent state. Override this method only when the slicer is based on a
    parent stream and needs to read that stream incrementally using the state.

    Returns:
        Optional[Mapping[str, StreamState]]: The current state of the parent streams in a dictionary format, or
        None when the slicer has no parent state (the default).
        The returned format will be:
        {
            "parent_stream_name1": {
                "last_updated": "2023-05-27T00:00:00Z"
            },
            "parent_stream_name2": {
                "last_updated": "2023-05-27T00:00:00Z"
            }
        }
    """
    return None

0 commit comments

Comments
 (0)