From 068b44f2cc5f586cbf337d0eba168d7edc88d4aa Mon Sep 17 00:00:00 2001 From: Leonard Eshun Date: Fri, 13 Dec 2024 23:18:43 +0000 Subject: [PATCH] etl --- country_capital.json | 196 ------------------------- databricks_scripts/extract.py | 1 + databricks_scripts/query.py | 12 +- databricks_scripts/transform_n_load.py | 1 + 4 files changed, 12 insertions(+), 198 deletions(-) delete mode 100644 country_capital.json diff --git a/country_capital.json b/country_capital.json deleted file mode 100644 index 0611a05..0000000 --- a/country_capital.json +++ /dev/null @@ -1,196 +0,0 @@ - - { "afghanistan": "Kabul", - "albania": "Tirana", - "algeria": "Algiers", - "andorra": "Andorra la Vella", - "angola": "Luanda", - "antigua and barbuda": "Saint John's", - "argentina": "Buenos Aires", - "armenia": "Yerevan", - "australia": "Canberra", - "austria": "Vienna", - "azerbaijan": "Baku", - "bahamas": "Nassau", - "bahrain": "Manama", - "bangladesh": "Dhaka", - "barbados": "Bridgetown", - "belarus": "Minsk", - "belgium": "Brussels", - "belize": "Belmopan", - "benin": "Porto-Novo", - "bhutan": "Thimphu", - "bolivia": "Sucre", - "bosnia and herzegovina": "Sarajevo", - "botswana": "Gaborone", - "brazil": "Brasilia", - "brunei": "Bandar Seri Begawan", - "bulgaria": "Sofia", - "burkina faso": "Ouagadougou", - "burundi": "Gitega", - "cabo verde": "Praia", - "cambodia": "Phnom Penh", - "cameroon": "Yaoundé", - "canada": "Ottawa", - "central african republic": "Bangui", - "chad": "N'Djamena", - "chile": "Santiago", - "china": "Beijing", - "colombia": "Bogotá", - "comoros": "Moroni", - "congo, democratic republic of the": "Kinshasa", - "congo, republic of the": "Brazzaville", - "costa rica": "San José", - "croatia": "Zagreb", - "cuba": "Havana", - "cyprus": "Nicosia", - "czech republic": "Prague", - "denmark": "Copenhagen", - "djibouti": "Djibouti", - "dominica": "Roseau", - "dominican republic": "Santo Domingo", - "ecuador": "Quito", - "egypt": "Cairo", - "el salvador": "San Salvador", - "equatorial guinea": "Malabo", - "eritrea": "Asmara", - "estonia": "Tallinn", - "eswatini": "Mbabane", - "ethiopia": "Addis Ababa", - "fiji": "Suva", - "finland": "Helsinki", - "france": "Paris", - "gabon": "Libreville", - "gambia": "Banjul", - "georgia": "Tbilisi", - "germany": "Berlin", - "ghana": "Accra", - "greece": "Athens", - "grenada": "Saint George's", - "guatemala": "Guatemala City", - "guinea": "Conakry", - "uinea-Bbissau": "Bissau", - "guyana": "Georgetown", - "haiti": "Port-au-Prince", - "honduras": "Tegucigalpa", - "hungary": "Budapest", - "iceland": "Reykjavik", - "india": "New Delhi", - "indonesia": "Jakarta", - "iran": "Tehran", - "iraq": "Baghdad", - "ireland": "Dublin", - "israel": "Jerusalem", - "italy": "Rome", - "jamaica": "Kingston", - "japan": "Tokyo", - "jordan": "Amman", - "kazakhstan": "Astana", - "kenya": "Nairobi", - "kiribati": "South Tarawa", - "kuwait": "Kuwait City", - "kyrgyzstan": "Bishkek", - "laos": "Vientiane", - "latvia": "Riga", - "lebanon": "Beirut", - "lesotho": "Maseru", - "liberia": "Monrovia", - "libya": "Tripoli", - "liechtenstein": "Vaduz", - "lithuania": "Vilnius", - "luxembourg": "Luxembourg", - "madagascar": "Antananarivo", - "malawi": "Lilongwe", - "malaysia": "Kuala Lumpur", - "maldives": "Malé", - "mali": "Bamako", - "malta": "Valletta", - "marshall islands": "Majuro", - "mauritania": "Nouakchott", - "mauritius": "Port Louis", - "mexico": "Mexico City", - "micronesia": "Palikir", - "moldova": "Chisinau", - "monaco": "Monaco", - "mongolia": "Ulaanbaatar", - "montenegro": "Podgorica", - "morocco": "Rabat", - "mozambique": "Maputo", - "myanmar": "Naypyidaw", - "namibia": "Windhoek", - "nauru": "No official capital", - "nepal": "Kathmandu", - "netherlands": "Amsterdam", - "new Zzealand": "Wellington", - "nicaragua": "Managua", - "niger": "Niamey", - "nigeria": "Abuja", - "north korea": "Pyongyang", - "north macedonia": "Skopje", - "norway": "Oslo", - "oman": "Muscat", - "pakistan": "Islamabad", - "palau": "Ngerulmud", - "panama": "Panama City", - "papua new guinea": "Port Moresby", - "paraguay": "Asunción", - "peru": "Lima", - "philippines": "Manila", - "poland": "Warsaw", - "portugal": "Lisbon", - "qatar": "Doha", - "romania": "Bucharest", - "russia": "Moscow", - "rwanda": "Kigali", - "saint kitts and nevis": "Basseterre", - "saint lucia": "Castries", - "saint vincent and the grenadines": "Kingstown", - "samoa": "Apia", - "san marino": "San Marino", - "sao tome and principe": "São Tomé", - "saudi arabia": "Riyadh", - "senegal": "Dakar", - "serbia": "Belgrade", - "seychelles": "Victoria", - "sierra leone": "Freetown", - "singapore": "Singapore", - "slovakia": "Bratislava", - "slovenia": "Ljubljana", - "solomon islands": "Honiara", - "somalia": "Mogadishu", - "south africa": "Pretoria", - "south korea": "Seoul", - "south sudan": "Juba", - "spain": "Madrid", - "sri lanka": "Sri Jayawardenepura Kotte", - "sudan": "Khartoum", - "suriname": "Paramaribo", - "sweden": "Stockholm", - "switzerland": "Bern", - "syria": "Damascus", - "taiwan": "Taipei", - "tajikistan": "Dushanbe", - "tanzania": "Dodoma", - "thailand": "Bangkok", - "timor-Leste": "Dili", - "togo": "Lomé", - "tonga": "Nuku'alofa", - "trinidad and tobago": "Port of Spain", - "tunisia": "Tunis", - "turkey": "Ankara", - "turkmenistan": "Ashgabat", - "tuvalu": "Funafuti", - "uganda": "Kampala", - "ukraine": "Kyiv", - "united urab emirates": "Abu Dhabi", - "united kingdom": "London", - "united states": "Washington D.C.", - "uruguay": "Montevideo", - "uzbekistan": "Tashkent", - "vanuatu": "Port Vila", - "vatican city": "Vatican City", - "venezuela": "Caracas", - "vietnam": "Hanoi", - "yemen": "Sana'a", - "zambia": "Lusaka", - "zimbabwe": "Harare" -} \ No newline at end of file diff --git a/databricks_scripts/extract.py b/databricks_scripts/extract.py index 0555207..1e5662f 100644 --- a/databricks_scripts/extract.py +++ b/databricks_scripts/extract.py @@ -25,6 +25,7 @@ def do_extract(): extract( "https://data.cityofnewyork.us/resource/c3uy-2p5r.csv?$limit=200000", "air_quality.csv", + on_databricks=True, ) log_tests("Testing if CSV file exists...") diff --git a/databricks_scripts/query.py b/databricks_scripts/query.py index ade6d03..9a31895 100644 --- a/databricks_scripts/query.py +++ b/databricks_scripts/query.py @@ -24,11 +24,19 @@ """ -def query(): +def query(on_databricks=False): result_df = execute_read_query(script_to_execute) result_df.show() save_output(result_df.toPandas().to_markdown()) - result_df.write.csv("/tmp/Aggregation_Query_Result", header=True, mode="overwrite") + result_df.write.csv( + ( + "/Workspace/Workspace/Shared/Leonard_Eshun_Mini_Project_Eleven/data/" + if on_databricks + else "./Aggregation_Query_Result" + ), + header=True, + mode="overwrite", + ) if __name__ == "__main__": diff --git a/databricks_scripts/transform_n_load.py b/databricks_scripts/transform_n_load.py index 5020afb..cf8998e 100644 --- a/databricks_scripts/transform_n_load.py +++ b/databricks_scripts/transform_n_load.py @@ -48,6 +48,7 @@ def transform_and_load(): "fn_geo_id": 6, "fn_indicator_id": 1, }, + on_databricks=True, ) log_tests("Transform and Load Test Successful", last_in_group=True) print("Transform and Load Test Successful")