Export External Tables from Azure Databricks Workspace
Hello!
I recently needed to export the “CREATE” statements for all Hive tables on an Azure Databricks instance whose paths were set externally to an Azure Data Lake. Other than changing the output_folder and the name(s) of your mount points below, this script should run on any workspace. One caveat: there is still some work to do on the ordering of the statements saved to the CSV files, because views require their underlying tables to be created before they can run. Since I only had to run this once, I simply re-ran the small number of views that failed by hand; a sketch of one way to order views after tables follows the script.
import re

from pyspark.sql.types import StringType
from pyspark.sql.utils import AnalysisException

create_statements = []
phantom_tables = []
unmanaged_tables = []
output_folder = 'datalakelocationhere'
databases = sqlContext.sql("SHOW DATABASES").collect()
for database in databases:
    print(f"--database: {database[0]}")
    create_database = f"CREATE DATABASE IF NOT EXISTS {database[0]}"
    create_statements.append(create_database)
    for table in sqlContext.sql(f"SHOW TABLES IN {database[0]}").collect():
        print(f"----table: {table[0]}.{table[1]}")
        try:
            show_create_table = f"SHOW CREATE TABLE {table[0]}.{table[1]}"
            create_table = sqlContext.sql(show_create_table).collect()[0][0]
        except AnalysisException:
            print(f"--Table {table[0]}.{table[1]} not found.")
            continue  # nothing to rewrite or export for this entry
        if create_table.startswith('CREATE TABLE'):
            create_table = create_table.replace("CREATE TABLE", "CREATE TABLE IF NOT EXISTS")
        elif create_table.startswith('CREATE EXTERNAL TABLE'):
            create_table = create_table.replace("CREATE EXTERNAL TABLE", "CREATE EXTERNAL TABLE IF NOT EXISTS")
        elif create_table.startswith('CREATE VIEW'):
            create_table = create_table.replace("CREATE VIEW", "CREATE OR REPLACE VIEW")
        elif create_table.startswith('CREATE GLOBAL VIEW'):
            create_table = create_table.replace("CREATE GLOBAL VIEW", "CREATE OR REPLACE GLOBAL VIEW")
        elif create_table.startswith('CREATE TEMPORARY VIEW'):
            create_table = create_table.replace("CREATE TEMPORARY VIEW", "CREATE OR REPLACE TEMPORARY VIEW")
        else:
            print("Not a managed table type.")
            unmanaged_tables.append(create_table)
        path = re.findall(r"/mnt/(lake|vault)/", create_table)  # our hive tables are saved to mount points that begin with these names
        if path:
            create_statements.append(create_table)
        else:
            print(f"*** not on data lake {table[0]}.{table[1]} ***")
            phantom_tables.append(create_table)
    # save one CSV of CREATE statements per database, then reset for the next one
    create_statements_df = spark.createDataFrame(create_statements, StringType())
    output_path = output_folder + database[0]
    create_statements_df.coalesce(1).write.format("csv").mode('overwrite').save(output_path)
    create_statements.clear()

if phantom_tables:
    print("Saving tables not stored on datalake")
    phantom_tables_df = spark.createDataFrame(phantom_tables, StringType())
    phantom_file = output_folder + 'phantom'
    phantom_tables_df.coalesce(1).write.format("csv").mode('overwrite').save(phantom_file)

if unmanaged_tables:
    print("Listing tables that are not currently managed by this script.")
    unmanaged_tables_df = spark.createDataFrame(unmanaged_tables, StringType())
    unmanaged_file = output_folder + 'unmanaged'
    unmanaged_tables_df.coalesce(1).write.format("csv").mode('overwrite').save(unmanaged_file)
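
As for the ordering caveat mentioned at the top: the per-database CSVs are written in discovery order, so a view's CREATE statement can land before the tables it references. Here is a minimal sketch of one way around that, assuming the create_statements list built by the script above. It keys on the fact that the script rewrites every view statement to start with "CREATE OR REPLACE", and the replay helper is hypothetical, not something I ran in production.

# Minimal sketch: order view definitions after table definitions so that a
# top-to-bottom replay creates dependencies first. Relies on the script above
# rewriting every view statement to start with "CREATE OR REPLACE".
view_statements = [s for s in create_statements if s.startswith("CREATE OR REPLACE")]
table_statements = [s for s in create_statements if not s.startswith("CREATE OR REPLACE")]
ordered_statements = table_statements + view_statements

def replay(statements):
    """Replay CREATE statements on the target workspace, retrying failures
    until no further progress is made, which picks up views that depend on
    other views. Illustrative only; 'statements' is assumed to be the list
    read back from one of the exported CSVs."""
    pending = list(statements)
    while pending:
        failed = []
        for statement in pending:
            try:
                spark.sql(statement)
            except Exception:
                failed.append(statement)
        if len(failed) == len(pending):  # no progress made, give up and report
            raise RuntimeError(f"{len(failed)} statements could not be applied")
        pending = failed

With the retry loop, view-on-view dependencies sort themselves out over a few passes, which would have saved me the manual re-runs mentioned above.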