24
24
from kubeflow .training .constants import constants
25
25
from kubeflow .training .utils import utils
26
26
27
- logging .basicConfig (format = "%(message)s" )
28
- logging .getLogger ().setLevel (logging .INFO )
27
+ logger = logging .getLogger (__name__ )
29
28
30
29
status_logger = utils .StatusLogger (
31
30
header = "{:<30.30} {:<20.20} {}" .format ("NAME" , "STATE" , "TIME" ),
@@ -222,7 +221,7 @@ def create_job(
222
221
f"Failed to create { job_kind } : { namespace } /{ job .metadata .name } "
223
222
)
224
223
225
- logging . info (f"{ job_kind } { namespace } /{ job .metadata .name } has been created" )
224
+ logger . debug (f"{ job_kind } { namespace } /{ job .metadata .name } has been created" )
226
225
227
226
def get_job (
228
227
self ,
@@ -771,7 +770,7 @@ def get_job_logs(
771
770
replica_index : Optional [int ] = None ,
772
771
follow : bool = False ,
773
772
timeout : int = constants .DEFAULT_TIMEOUT ,
774
- ):
773
+ ) -> Dict [ str , str ] :
775
774
"""Print the training logs for the Job. By default it returns logs from
776
775
the `master` pod.
777
776
@@ -801,6 +800,10 @@ def get_job_logs(
801
800
timeout: Optional, Kubernetes API server timeout in seconds
802
801
to execute the request.
803
802
803
+ Returns:
804
+ Dict[str, str]: A dictionary in which the keys are pod names and the
805
+ values are the corresponding logs.
806
+
804
807
Raises:
805
808
ValueError: Job replica type is invalid.
806
809
TimeoutError: Timeout to get Job pods.
@@ -819,6 +822,7 @@ def get_job_logs(
819
822
timeout = timeout ,
820
823
)
821
824
825
+ logs_dict = {}
822
826
if pods and follow :
823
827
log_streams = []
824
828
for pod in pods :
@@ -849,7 +853,7 @@ def get_job_logs(
849
853
if logline is None :
850
854
finished [index ] = True
851
855
break
852
- logging . info ( "[Pod %s]: %s" , pods [index ], logline )
856
+ print ( f "[Pod { pods [index ]} ]: { logline } " )
853
857
except queue .Empty :
854
858
break
855
859
elif pods :
@@ -860,10 +864,12 @@ def get_job_logs(
860
864
namespace ,
861
865
container = constants .JOB_PARAMETERS [job_kind ]["container" ],
862
866
)
863
- logging . info ( "The logs of pod %s: \n %s" , pod , pod_logs )
867
+ logs_dict [ pod ] = pod_logs
864
868
except Exception :
865
869
raise RuntimeError (f"Failed to read logs for pod { namespace } /{ pod } " )
866
870
871
+ return logs_dict
872
+
867
873
def update_job (
868
874
self ,
869
875
job : constants .JOB_MODELS_TYPE ,
@@ -908,7 +914,7 @@ def update_job(
908
914
except Exception :
909
915
raise RuntimeError (f"Failed to update { job_kind } : { namespace } /{ name } " )
910
916
911
- logging . info (f"{ job_kind } { namespace } /{ name } has been updated" )
917
+ logger . debug (f"{ job_kind } { namespace } /{ name } has been updated" )
912
918
913
919
def delete_job (
914
920
self ,
@@ -950,4 +956,4 @@ def delete_job(
950
956
except Exception :
951
957
raise RuntimeError (f"Failed to delete { job_kind } : { namespace } /{ name } " )
952
958
953
- logging . info (f"{ job_kind } { namespace } /{ name } has been deleted" )
959
+ logger . debug (f"{ job_kind } { namespace } /{ name } has been deleted" )
0 commit comments