# -*- coding: utf-8 -*-
"""points_distribution.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1RUYKMWVGgZyBpGsr4vNjBH9j15I9kSJC
"""
import pandas as pd
import numpy as np
from shapely.wkt import loads
from sklearn.cluster import KMeans
import os


def read_csv(filepath):
    """Reads a CSV file into a DataFrame."""
    try:
        return pd.read_csv(filepath)
    except FileNotFoundError:
        print("File not found. Please check the file path and try again.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return None


def save_to_pickle(data, filepath):
    """Saves the DataFrame to a pickle file."""
    dir_name = os.path.dirname(filepath)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    try:
        data.to_pickle(filepath)
        print(f"Data saved to {filepath}")
    except Exception as e:
        print(f"An error occurred while saving to pickle: {e}")


def select_uniform_points(points_wkt, n_clusters):
    """Selects spatially uniform points from a list of WKT strings.

    Clusters the points with k-means and returns, for each cluster, the
    input point closest to that cluster's centroid.
    """
    points = [loads(wkt) for wkt in points_wkt]
    if not points:
        return []
    # KMeans requires n_samples >= n_clusters; clamp to avoid a ValueError
    # when a group has fewer remaining points than requested.
    n_clusters = min(n_clusters, len(points))
    coords = np.array([point.coords[0] for point in points])
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(coords)
    uniform_points = []
    for centroid in kmeans.cluster_centers_:
        # Snap each centroid back to the nearest actual input point.
        distances = np.sqrt(((coords - centroid) ** 2).sum(axis=1))
        closest_point_idx = np.argmin(distances)
        uniform_points.append(points[closest_point_idx])
    return uniform_points
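
# A minimal usage sketch with hypothetical coordinates: four points forming two
# spatial groups should yield one representative per group when n_clusters=2.
#
#     sample_wkt = ["POINT (0 0)", "POINT (0 1)", "POINT (10 0)", "POINT (10 1)"]
#     reps = select_uniform_points(sample_wkt, 2)
#     # -> one point near x=0 and one near x=10 (order follows KMeans labeling)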


def process_data(data, n, m):
    """Selects the top n points by Score plus m spatially uniform points
    for each GEOID_12 group."""
    new_rows = []
    for name, group in data.groupby('GEOID_12'):
        top_n = group.nlargest(n, 'Score')
        # Draw the uniform points only from rows not already in the top n.
        remaining_points_wkt = group.loc[~group.index.isin(top_n.index), 'geometry'].tolist()
        uniform_points = select_uniform_points(remaining_points_wkt, m)
        # NOTE: matching Scores back on `point.wkt` assumes the stored WKT
        # strings round-trip through shapely unchanged (e.g. "POINT (0 0)" spacing).
        uniform_df = pd.DataFrame({
            'GEOID_12': [name] * len(uniform_points),
            'Score': [group.loc[group['geometry'] == point.wkt, 'Score'].values[0]
                      for point in uniform_points],
            'geometry': [point.wkt for point in uniform_points],
        })
        new_rows.append(pd.concat([top_n, uniform_df]))
    return pd.concat(new_rows).reset_index(drop=True)
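
# Expected input schema, inferred from usage rather than documented upstream:
# one point per row, with columns GEOID_12 (group key), Score (numeric ranking),
# and geometry (point WKT such as "POINT (-122.4 37.8)").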


def merge_dataframes(data1, data2, key):
    """Merges point data with polygon data on a specified key."""
    # Keep only the necessary columns, and rename on a copy rather than
    # in place so the caller's DataFrame is not mutated.
    data1 = data1[[key, 'Score', 'geometry']]
    data2 = data2.rename(columns={'geometry': 'polygon_geometry'})[[key, 'polygon_geometry']]
    return data1.merge(data2, on=key, how='left')
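
# After the left join, each selected point carries its parent polygon:
# GEOID_12, Score, geometry (point WKT), polygon_geometry (polygon WKT).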


def generate_random_points(path_to_random_points="data/filtered_new_point_data_v1.csv",
                           path_to_polygon="data/filtered_gdf_v2.csv",
                           top_n=10, uniform_n=20,
                           path_to_save_output="data/final_points.pkl"):
    """Main execution flow: select points, merge with polygons, and save."""
    data = read_csv(path_to_random_points)
    if data is None or data.empty:
        print("Point file not found or empty ... Exiting!")
        return

    print("[INFO] Processing Data.. ")
    processed_data = process_data(data, top_n, uniform_n)

    polygon_data = read_csv(path_to_polygon)
    if polygon_data is None or polygon_data.empty:
        print("Polygon file not found or empty ... Exiting!")
        return

    print("[INFO] Merging File.. ")
    merged_data = merge_dataframes(processed_data, polygon_data, 'GEOID_12')

    print("[INFO] Saving File.. ")
    save_to_pickle(merged_data, path_to_save_output)
    # The saved result can later be reloaded with pd.read_pickle(path_to_save_output).


if __name__ == "__main__":
    generate_random_points(top_n=3, uniform_n=10)