The following Python function will capture the Dataproc job ID, if anyone is interested:

import pyspark

sc = pyspark.SparkContext.getOrCreate()

def extract_jobid(sc):
    # Access the underlying SparkConf
    spark_conf = sc.getConf()
    # Get the value of the spark.yarn.tags configuration
    yarn_tags = spark_conf.get("spark.yarn.tags")
    # Extract the job ID from yarn_tags using string processing,
    # assuming the relevant tag has the format "dataproc_job_<job_id>"
    job_id = None
    if yarn_tags:
        for tag in yarn_tags.split(","):
            if tag.startswith("dataproc_job_") and not tag.startswith("dataproc_job_attempt_timestamp_"):
                # Strip the prefix rather than splitting on "_", so job IDs
                # that themselves contain underscores come back intact
                job_id = tag[len("dataproc_job_"):]
                break
    return job_id

# Call the function and print the Dataproc job ID
print(extract_jobid(sc))
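
As a quick sanity check, the parsing logic can be exercised against a hand-written tag string without a cluster. The tag values below are made up for illustration; the real contents of spark.yarn.tags depend on your cluster and job:

# Hypothetical spark.yarn.tags value; the job ID and timestamp are invented
sample_tags = "dataproc_job_my_job_1234,dataproc_job_attempt_timestamp_1712345678"
for tag in sample_tags.split(","):
    if tag.startswith("dataproc_job_") and not tag.startswith("dataproc_job_attempt_timestamp_"):
        print(tag[len("dataproc_job_"):])  # prints: my_job_1234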