
Commit 94b4d7c

1 parent a23df30 commit 94b4d7c

10 files changed: +391 additions, -140 deletions

alerts.yaml

Whitespace-only changes.

metadata.json

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+{
+  "type": "connector",
+  "metadata": {
+    "id": "object-store-connector",
+    "name": "Object Store Connector",
+    "version": "1.0.0",
+    "tenant": "multiple",
+    "category": "File",
+    "description": "The Object Store Connector is used to move data from any Object Store to the Obsrv platform",
+    "technology": "python",
+    "runtime": "spark",
+    "licence": "MIT",
+    "owner": "Sunbird",
+    "main_class": null,
+    "main_program": "object_store_connector"
+  },
+  "connectors": [
+    {
+      "id": "aws-s3-connector",
+      "name": "AWS S3",
+      "description": "The AWS S3 Connector is used to move data from any S3 Bucket to the Obsrv platform",
+      "icon": "https://upload.wikimedia.org/wikipedia/commons/b/bc/Amazon-S3-Logo.svg",
+      "config": {
+        "source": {
+          "type": "s3",
+          "bucket": "",
+          "prefix": "",
+          "prefix_format": "%y/%m/%d/%H",
+          "credentials": {
+            "access_key": "",
+            "secret_key": "",
+            "region": ""
+          },
+          "max_retries": "<int>"
+        }
+      }
+    },
+    {
+      "id": "azure-blob-connector",
+      "name": "Azure Blob Store",
+      "description": "The Azure Blob Store Connector is used to move data from any Azure Blob Container to the Obsrv platform",
+      "icon": "https://upload.wikimedia.org/wikipedia/commons/f/fa/Microsoft_Azure.svg",
+      "config": {
+        "source": {
+          "type": "azure_blob",
+          "container": "",
+          "prefix": "",
+          "prefix_format": "%y/%m/%d/%H",
+          "credentials": {
+            "account_name": "",
+            "account_key": ""
+          },
+          "max_retries": "<int>"
+        }
+      }
+    },
+    {
+      "id": "gcs-connector",
+      "name": "Google Cloud Storage",
+      "description": "The GCS Connector is used to move data from any Google Bucket to the Obsrv platform",
+      "icon": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/51/Google_Cloud_logo.svg/512px-Google_Cloud_logo.svg.png",
+      "config": {
+        "source": {
+          "type": "gcs",
+          "bucket": "",
+          "prefix": "",
+          "prefix_format": "%y/%m/%d/%H",
+          "credentials": {
+            "project_id": "",
+            "client_email": "",
+            "private_key": ""
+          }
+        },
+        "max_retries": "<int>"
+      }
+    }
+  ]
+}
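
For reference, a minimal sketch (not part of this commit) of how the metadata above could be consumed: load metadata.json with the standard library and return the "source" block for a given connector id. The helper name load_source_config is illustrative.

import json

def load_source_config(metadata_path: str, connector_id: str) -> dict:
    # Read metadata.json and return the "source" config block for one connector.
    with open(metadata_path) as f:
        metadata = json.load(f)
    for connector in metadata["connectors"]:
        if connector["id"] == connector_id:
            return connector["config"]["source"]
    raise KeyError("connector not found: {}".format(connector_id))

source = load_source_config("metadata.json", "aws-s3-connector")
print(source["type"], source["prefix_format"])  # -> s3 %y/%m/%d/%H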

metrics.yaml

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+metrics:
+  - name: num_api_calls
+    alias: Number of API Calls
+    description: The number of API calls made to the cloud provider
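
A brief sketch of how a provider could report the metric declared above. It assumes only the metrics_collector.collect(name, value) call used elsewhere in this commit; paginate is a hypothetical stand-in for a paged cloud listing API.

def fetch_all_pages(paginate, metrics_collector):
    api_calls = 0
    objects = []
    for page in paginate():  # each page fetch is one call to the cloud provider
        api_calls += 1
        objects.extend(page)
    metrics_collector.collect("num_api_calls", api_calls)
    return objects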

object_store_connector/__main__.py

Lines changed: 6 additions & 4 deletions
@@ -1,10 +1,12 @@
 import os
-from obsrv.connector.batch import SourceConnector
+
 from connector import ObjectStoreConnector
+from obsrv.connector.batch import SourceConnector
+
 # from obsrv.utils import Config

-if __name__ == '__main__':
+if __name__ == "__main__":
     connector = ObjectStoreConnector()
-    config_file_path = os.path.join(os.path.dirname(__file__), 'config/config.yaml')
+    config_file_path = os.path.join(os.path.dirname(__file__), "config/config.yaml")

-    SourceConnector.process(connector=connector, config_file_path=config_file_path)
+    SourceConnector.process(connector=connector, config_file_path=config_file_path)
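
For context: the entry point hands SourceConnector.process a YAML config path, and the connector (next diff) reads only the nested "source" block of that config. An illustrative Python equivalent of that block for the S3 variant, with placeholder values and keys taken from metadata.json:

connector_config = {
    "source": {
        "type": "s3",                    # selects the S3 provider in _get_provider
        "bucket": "my-bucket",           # placeholder
        "prefix": "data/",               # placeholder
        "prefix_format": "%y/%m/%d/%H",
        "credentials": {
            "access_key": "<key>",
            "secret_key": "<secret>",
            "region": "<region>",
        },
        "max_retries": 5,                # falls back to MAX_RETRY_COUNT (10) if omitted
    }
}
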
Lines changed: 82 additions & 39 deletions
@@ -1,26 +1,24 @@
 import datetime
-import time
 import json
+import time
 from typing import Any, Dict, Iterator
+
+from models.object_info import ObjectInfo
 from obsrv.common import ObsrvException
+from obsrv.connector import ConnectorContext, MetricsCollector
 from obsrv.connector.batch import ISourceConnector
-from obsrv.connector import ConnectorContext
-from obsrv.connector import MetricsCollector
-from obsrv.models import ErrorData, StatusCode, ExecutionState
+from obsrv.models import ErrorData, ExecutionState, StatusCode
 from obsrv.utils import LoggerController
-
-from pyspark.sql import SparkSession, DataFrame
+from provider.s3 import S3
 from pyspark.conf import SparkConf
+from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import lit
-from pyspark.sql.types import *
-
-from provider.s3 import S3
-from models.object_info import ObjectInfo

 logger = LoggerController(__name__)

 MAX_RETRY_COUNT = 10

+
 class ObjectStoreConnector(ISourceConnector):
     def __init__(self):
         self.provider = None
@@ -30,21 +28,34 @@ def __init__(self):
         self.error_state = StatusCode.FAILED.value
         self.running_state = ExecutionState.RUNNING.value
         self.not_running_state = ExecutionState.NOT_RUNNING.value
-        self.queued_state = ExecutionState.QUEUED.value
-
-    def process(self, sc: SparkSession, ctx: ConnectorContext, connector_config: Dict[Any, Any], metrics_collector: MetricsCollector) -> Iterator[DataFrame]:
-        if (ctx.state.get_state("status", default_value=self.not_running_state) == self.running_state):
+        self.queued_state = ExecutionState.QUEUED.value
+
+    def process(
+        self,
+        sc: SparkSession,
+        ctx: ConnectorContext,
+        connector_config: Dict[Any, Any],
+        metrics_collector: MetricsCollector,
+    ) -> Iterator[DataFrame]:
+        if (
+            ctx.state.get_state("status", default_value=self.not_running_state)
+            == self.running_state
+        ):
             logger.info("Connector is already running. Skipping processing.")
             return

         ctx.state.put_state("status", self.running_state)
         ctx.state.save_state()
-        self.max_retries = connector_config["source"]["max_retries"] if "max_retries" in connector_config["source"] else MAX_RETRY_COUNT
+        self.max_retries = (
+            connector_config["source"]["max_retries"]
+            if "max_retries" in connector_config["source"]
+            else MAX_RETRY_COUNT
+        )
         self._get_provider(connector_config)
         self._get_objects_to_process(ctx, metrics_collector)
         for res in self._process_objects(sc, ctx, metrics_collector):
             yield res
-
+
         last_run_time = datetime.datetime.now()
         ctx.state.put_state("status", self.not_running_state)
         ctx.state.put_state("last_run_time", last_run_time)
@@ -54,67 +65,99 @@ def get_spark_conf(self, connector_config) -> SparkConf:
         self._get_provider(connector_config)
         if self.provider is not None:
             return self.provider.get_spark_config(connector_config)
-
+
         return SparkConf()

     def _get_provider(self, connector_config: Dict[Any, Any]):
-        if connector_config["source"]["type"] == "s3":
+        if connector_config["source"]["type"] == "s3":
             self.provider = S3(connector_config)
         else:
-            ObsrvException(ErrorData("INVALID_PROVIDER", "provider not supported: {}".format(connector_config["source"]["type"])))
-
-    def _get_objects_to_process(self, ctx: ConnectorContext, metrics_collector: MetricsCollector) -> None:
+            ObsrvException(
+                ErrorData(
+                    "INVALID_PROVIDER",
+                    "provider not supported: {}".format(
+                        connector_config["source"]["type"]
+                    ),
+                )
+            )
+
+    def _get_objects_to_process(
+        self, ctx: ConnectorContext, metrics_collector: MetricsCollector
+    ) -> None:
         objects = ctx.state.get_state("to_process", list())
         if ctx.building_block is not None and ctx.env is not None:
             self.dedupe_tag = "{}-{}".format(ctx.building_block, ctx.env)
         else:
-            raise ObsrvException(ErrorData("INVALID_CONTEXT", "building_block or env not found in context"))
-
-        if not len(objects):
-            num_files_discovered = ctx.stats.get_stat('num_files_discovered', 0)
+            raise ObsrvException(
+                ErrorData(
+                    "INVALID_CONTEXT", "building_block or env not found in context"
+                )
+            )
+
+        if not len(objects):
+            num_files_discovered = ctx.stats.get_stat("num_files_discovered", 0)
             objects = self.provider.fetch_objects(ctx, metrics_collector)
             objects = self._exclude_processed_objects(ctx, objects)
             metrics_collector.collect("new_objects_discovered", len(objects))
             ctx.state.put_state("to_process", objects)
             ctx.state.save_state()
             num_files_discovered += len(objects)
-            ctx.stats.put_stat("num_files_discovered", num_files_discovered)
+            ctx.stats.put_stat("num_files_discovered", num_files_discovered)
            ctx.stats.save_stats()

         self.objects = objects

-    def _process_objects(self, sc: SparkSession, ctx: ConnectorContext, metrics_collector: MetricsCollector) -> Iterator[DataFrame]:
-        num_files_processed = ctx.stats.get_stat('num_files_processed', 0)
+    def _process_objects(
+        self,
+        sc: SparkSession,
+        ctx: ConnectorContext,
+        metrics_collector: MetricsCollector,
+    ) -> Iterator[DataFrame]:
+        num_files_processed = ctx.stats.get_stat("num_files_processed", 0)
         for i in range(0, len(self.objects)):
             obj = self.objects[i]
             obj["start_processing_time"] = time.time()
-            columns = StructType([])
-            df = self.provider.read_object(obj.get("location"), sc=sc, metrics_collector=metrics_collector, file_format=ctx.data_format)
+            df = self.provider.read_object(
+                obj.get("location"),
+                sc=sc,
+                metrics_collector=metrics_collector,
+                file_format=ctx.data_format,
+            )

             if df is None:
                 obj["num_of_retries"] += 1
                 if obj["num_of_retries"] < self.max_retries:
                     ctx.state.put_state("to_process", self.objects[i:])
                     ctx.state.save_state()
                 else:
-                    if not self.provider.update_tag(object=obj, tags=[{"key": self.dedupe_tag, "value": self.error_state}], metrics_collector=metrics_collector):
+                    if not self.provider.update_tag(
+                        object=obj,
+                        tags=[{"key": self.dedupe_tag, "value": self.error_state}],
+                        metrics_collector=metrics_collector,
+                    ):
                         break
                 return
             else:
                 df = self._append_custom_meta(sc, df, obj)
-                obj["download_time"] = time.time()-obj.get("start_processing_time")
-                if not self.provider.update_tag(object=obj, tags=[{"key": self.dedupe_tag, "value": self.success_state}], metrics_collector=metrics_collector):
+                obj["download_time"] = time.time() - obj.get("start_processing_time")
+                if not self.provider.update_tag(
+                    object=obj,
+                    tags=[{"key": self.dedupe_tag, "value": self.success_state}],
+                    metrics_collector=metrics_collector,
+                ):
                     break
-                ctx.state.put_state("to_process", self.objects[i+1:])
+                ctx.state.put_state("to_process", self.objects[i + 1 :])
                 ctx.state.save_state()
                 num_files_processed += 1
-                ctx.stats.put_stat("num_files_processed",num_files_processed)
+                ctx.stats.put_stat("num_files_processed", num_files_processed)
                 obj["end_processing_time"] = time.time()
                 yield df
-
+
         ctx.stats.save_stats()

-    def _append_custom_meta(self, sc: SparkSession, df: DataFrame, object: ObjectInfo) -> DataFrame:
+    def _append_custom_meta(
+        self, sc: SparkSession, df: DataFrame, object: ObjectInfo
+    ) -> DataFrame:
         addn_meta = {
             "location": object.get("location"),
             "file_size_kb": object.get("file_size_kb"),
@@ -123,7 +166,7 @@ def _append_custom_meta(self, sc: SparkSession, df: DataFrame, object: ObjectInf
             "end_processing_time": object.get("end_processing_time"),
             "file_hash": object.get("file_hash"),
             "num_of_retries": object.get("num_of_retries"),
-            "in_time": object.get("in_time")
+            "in_time": object.get("in_time"),
         }
         df = df.withColumn("_addn_source_meta", lit(json.dumps(addn_meta, default=str)))
         return df
@@ -134,4 +177,4 @@ def _exclude_processed_objects(self, ctx: ConnectorContext, objects):
             if not any(tag["key"] == self.dedupe_tag for tag in obj.get("tags")):
                 to_be_processed.append(obj)

-        return to_be_processed
+        return to_be_processed
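
For orientation, a condensed, Spark-free sketch of the control flow in _process_objects above; read_object, update_tag, and state are hypothetical stand-ins for the provider and obsrv state APIs, and only the retry/dedupe logic is reproduced.

def process_backlog(objects, read_object, update_tag, state, dedupe_tag, max_retries):
    for i, obj in enumerate(objects):
        data = read_object(obj["location"])
        if data is None:                                  # read failed
            obj["num_of_retries"] += 1
            if obj["num_of_retries"] < max_retries:
                state["to_process"] = objects[i:]         # keep it for the next run
            else:
                update_tag(obj, {dedupe_tag: "failed"})   # give up and mark it
            return                                        # stop this run early
        update_tag(obj, {dedupe_tag: "success"})          # tag so it is never re-read
        state["to_process"] = objects[i + 1:]             # checkpoint remaining backlog
        yield data
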
Lines changed: 19 additions & 23 deletions
@@ -1,24 +1,20 @@
 from dataclasses import dataclass, field
+from datetime import datetime
 from typing import List
 from uuid import uuid4
-from datetime import datetime
+

 @dataclass
 class Tag:
     key: str
     value: str

     def to_dict(self):
-        return {
-            'key': self.key,
-            'value': self.value
-        }
+        return {"key": self.key, "value": self.value}

     def to_aws(self):
-        return {
-            'Key': self.key,
-            'Value': self.value
-        }
+        return {"Key": self.key, "Value": self.value}
+

 @dataclass
 class ObjectInfo:
@@ -38,17 +34,17 @@ class ObjectInfo:

     def to_json(self):
         return {
-            'id': self.id,
-            'connector_id': self.connector_id,
-            'dataset_id': self.dataset_id,
-            'location': self.location,
-            'format': self.format,
-            'file_size_kb': self.file_size_kb,
-            'in_time': self.in_time,
-            'download_time': self.download_time,
-            'start_processing_time': self.start_processing_time,
-            'end_processing_time': self.end_processing_time,
-            'file_hash': self.file_hash,
-            'num_of_retries': self.num_of_retries,
-            'tags': [tag.__dict__ for tag in self.tags]
-        }
+            "id": self.id,
+            "connector_id": self.connector_id,
+            "dataset_id": self.dataset_id,
+            "location": self.location,
+            "format": self.format,
+            "file_size_kb": self.file_size_kb,
+            "in_time": self.in_time,
+            "download_time": self.download_time,
+            "start_processing_time": self.start_processing_time,
+            "end_processing_time": self.end_processing_time,
+            "file_hash": self.file_hash,
+            "num_of_retries": self.num_of_retries,
+            "tags": [tag.__dict__ for tag in self.tags],
+        }
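
A small usage sketch of the Tag dataclass above; ObjectInfo construction is omitted because its field defaults are not visible in this hunk. The to_aws() output matches the Key/Value casing used by the AWS S3 object-tagging API.

from models.object_info import Tag

tag = Tag(key="obsrv-dev", value="success")
print(tag.to_dict())  # {'key': 'obsrv-dev', 'value': 'success'}
print(tag.to_aws())   # {'Key': 'obsrv-dev', 'Value': 'success'}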
