Skip to content

Commit c10573b

Browse files
duanbingduanbing.0
andauthored
Attribution support (#418)
* Support attribution * Feature: enable negative example auto-generate * Fix unexpected conversion of bool in argparse * Bugfix: meta.example_ids may be dup * Code format * Add log * Pass the neg generating flag * Pass the neg generating flag Co-authored-by: duanbing.0 <[email protected]>
1 parent f44e2e7 commit c10573b

File tree

13 files changed

+1087
-8
lines changed

13 files changed

+1087
-8
lines changed

deploy/scripts/data_join/run_data_join_worker.sh

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ example_id_dump_threshold=$(normalize_env_to_args "--example_id_dump_threshold"
3636
data_block_builder=$(normalize_env_to_args "--data_block_builder" $DATA_BLOCK_BUILDER)
3737
data_block_compressed_type=$(normalize_env_to_args "--data_block_compressed_type" $DATA_BLOCK_COMPRESSED_TYPE)
3838
kvstore_type=$(normalize_env_to_args '--kvstore_type' $KVSTORE_TYPE)
39-
39+
max_conversion_delay=$(normalize_env_to_args '--max_conversion_delay' $MAX_CONVERSION_DELAY)
40+
enable_negative_example_generator=$(normalize_env_to_args '--enable_negative_example_generator' $ENABLE_NEGATIVE_EXAMPLE_GENERATOR)
4041
python -m fedlearner.data_join.cmd.data_join_worker_service \
4142
$PEER_ADDR \
4243
$MASTER_POD_NAMES \
@@ -47,4 +48,5 @@ python -m fedlearner.data_join.cmd.data_join_worker_service \
4748
$data_block_dump_interval $data_block_dump_threshold \
4849
$example_id_dump_interval $example_id_dump_threshold \
4950
$data_block_builder $data_block_compressed_type \
50-
$kvstore_type
51+
$kvstore_type $max_conversion_delay \
52+
$enable_negative_example_generator

fedlearner/common/argparse_util.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Copyright 2020 The FedLearner Authors. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# coding: utf-8
16+
17+
import argparse
18+
19+
def str_as_bool(v):
    """Interpret a command-line token as a boolean.

    Intended for use as an ``argparse`` ``type=`` callable.

    Args:
        v: either an actual ``bool`` (returned unchanged) or a string.
            Strings are compared case-insensitively against the accepted
            spellings listed below.

    Returns:
        True for 'yes', 'true', 't', 'y', '1';
        False for 'no', 'false', 'f', 'n', '0'.

    Raises:
        argparse.ArgumentTypeError: if the string matches none of the
            accepted spellings.
    """
    if isinstance(v, bool):
        return v

    token = v.lower()
    spellings_by_value = (
        (True, ('yes', 'true', 't', 'y', '1')),
        (False, ('no', 'false', 'f', 'n', '0')),
    )
    for value, spellings in spellings_by_value:
        if token in spellings:
            return value
    raise argparse.ArgumentTypeError('Boolean value expected.')

fedlearner/data_join/cmd/data_join_worker_service.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@
2020
import tensorflow
2121

2222
from fedlearner.common import data_join_service_pb2 as dj_pb
23+
from fedlearner.common.argparse_util import str_as_bool
2324
from fedlearner.data_join.common import get_kvstore_config
2425
from fedlearner.data_join.data_join_worker import DataJoinWorkerService
26+
from fedlearner.data_join.common import interval_to_timestamp
2527
tensorflow.compat.v1.enable_eager_execution()
2628

2729
if __name__ == "__main__":
@@ -76,6 +78,14 @@
7678
parser.add_argument('--data_block_compressed_type', type=str, default='',
7779
choices=['', 'ZLIB', 'GZIP'],
7880
help='the compressed type for data block')
81+
parser.add_argument('--max_conversion_delay', type=str, default="7D",
82+
help='the max delay of an impression occurred '\
83+
'before a conversion as an attribution pair, unit: '\
84+
'{Y|M|D|H|N|S}, i.e. 1N20S equals 80 seconds')
85+
parser.add_argument('--enable_negative_example_generator', type=str_as_bool,
86+
default=False, const=True, nargs='?',
87+
help="enable the negative example auto-generator, "\
88+
"filled with label: 0")
7989
args = parser.parse_args()
8090
worker_options = dj_pb.DataJoinWorkerOptions(
8191
use_mock_etcd=(args.kvstore_type == 'mock'),
@@ -91,6 +101,10 @@
91101
max_matching_window=args.max_matching_window,
92102
data_block_dump_interval=args.data_block_dump_interval,
93103
data_block_dump_threshold=args.data_block_dump_threshold,
104+
max_conversion_delay=interval_to_timestamp(\
105+
args.max_conversion_delay),
106+
enable_negative_example_generator=\
107+
args.enable_negative_example_generator,
94108
),
95109
example_id_dump_options=dj_pb.ExampleIdDumpOptions(
96110
example_id_dump_interval=args.example_id_dump_interval,

fedlearner/data_join/common.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,3 +371,29 @@ def get_kvstore_config(kvstore_type):
371371
addr = os.environ.get('ETCD_ADDR', 'localhost:2379')
372372
base_dir = os.environ.get('ETCD_BASE_DIR', 'fedlearner')
373373
return name, addr, None, None, base_dir
374+
375+
def interval_to_timestamp(itv):
    """Convert a compact interval string into a number of seconds.

    The string is a sequence of ``<number><unit>`` components, e.g.
    ``"1N20S"`` (80 seconds) or ``"7D"`` (604800 seconds). Units are
    case-insensitive and must appear in strictly descending order, each
    at most once:

        Y = 12 * 30 days, M = 30 days, D = day, H = hour,
        N = minute, S = second

    A trailing number with no unit is counted as seconds (``"1D30"`` is
    one day plus thirty seconds). An empty string yields 0.

    Args:
        itv: the interval string to parse.

    Returns:
        The interval length in seconds as an int, or None when the string
        is malformed: an unknown unit, units out of order or repeated, a
        unit with no number in front of it, or a trailing bare number
        when seconds were already given explicitly.
    """
    units = ("Y", "M", "D", "H", "N", "S")
    seconds_per_unit = dict(zip(
        units, (3600 * 24 * 30 * 12, 3600 * 24 * 30, 3600 * 24, 3600, 60, 1)
    ))
    # Larger units get a larger rank; ranks must strictly decrease while
    # scanning, which enforces both ordering and uniqueness.
    rank = {name: len(units) - pos for pos, name in enumerate(units)}

    total = 0
    digits = ""
    prev_rank = len(units) + 1
    for ch in itv:
        # isdecimal() (not isdigit()) so that only characters int() can
        # actually parse are collected; e.g. superscript digits fall
        # through to the "unknown unit" rejection below.
        if ch.isdecimal():
            digits += ch
            continue
        ch = ch.upper()
        if ch not in rank or prev_rank <= rank[ch]:
            return None
        if not digits:
            # A unit with no number in front of it is malformed; report
            # it the same way as every other format error.
            return None
        total += int(digits) * seconds_per_unit[ch]
        prev_rank = rank[ch]
        digits = ""

    if digits:
        # A trailing bare number means seconds, which is only valid if
        # an explicit seconds component has not been consumed already.
        if prev_rank <= rank["S"]:
            return None
        total += int(digits)
    return total

fedlearner/data_join/data_block_dumper.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,9 @@ def _dump_data_block_by_meta(self, meta):
164164
example_num = len(meta.example_ids)
165165
for (index, item) in self._raw_data_visitor:
166166
example_id = item.example_id
167-
if example_id == meta.example_ids[match_index]:
167+
# Elements in meta.example_ids may be duplicated
168+
while match_index < example_num and\
169+
example_id == meta.example_ids[match_index]:
168170
data_block_builder.write_item(item)
169171
match_index += 1
170172
if match_index >= example_num:

fedlearner/data_join/data_block_manager.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ def init_by_meta(self, meta):
6363
def set_data_block_manager(self, data_block_manager):
6464
self._data_block_manager = data_block_manager
6565

66-
def append_item(self, item, leader_index, follower_index, event_time=None):
66+
def append_item(self, item, leader_index, follower_index, event_time=None,\
67+
allow_dup=False):
6768
example_id = item.example_id
6869
if event_time is None:
6970
event_time = item.event_time
@@ -75,10 +76,18 @@ def append_item(self, item, leader_index, follower_index, event_time=None):
7576
self._data_block_meta.start_time = event_time
7677
self._data_block_meta.end_time = event_time
7778
else:
78-
assert self._data_block_meta.leader_start_index < leader_index, \
79-
"leader start index should be incremental"
80-
assert self._data_block_meta.leader_end_index < leader_index, \
81-
"leader end index should be incremental"
79+
if not allow_dup:
80+
assert self._data_block_meta.leader_start_index < leader_index,\
81+
"leader start index should be incremental"
82+
assert self._data_block_meta.leader_end_index < leader_index, \
83+
"leader end index should be incremental"
84+
else:
85+
assert self._data_block_meta.leader_start_index <= \
86+
leader_index,\
87+
"leader start index should be incremental by GE"
88+
assert self._data_block_meta.leader_end_index <= leader_index, \
89+
"leader end index should be incremental by LE"
90+
8291
self._data_block_meta.leader_end_index = leader_index
8392
if event_time < self._data_block_meta.start_time:
8493
self._data_block_meta.start_time = event_time

0 commit comments

Comments
 (0)