# PACKAGES
import requests
import pandas as pd
import geopandas as gpd
import json
from . import sheets
from .utils import *
import itertools
import shapely
import contextily as ctx
from tqdm import tqdm
import matplotlib.pyplot as plt
import fiona
# USEFUL FUNCTIONS
def postcodeCoords(location):
    """
    PURPOSE
    This function uses the google search function to search for a particular location.
    This would return the latitude and longitude of the location, if it goes smoothly.
    This function is quite primitive, as such, I mostly use it to get the locations of
    postal codes in Singapore, using search parameters like "Singapore 410230".

    PARAMETERS
    location [str]: input the location of search

    OUTPUT
    A tuple containing the lat and the long of the location searched.
    If not found, return (0,0)
    """
    # Build the query first: reusing the same quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    query = "+".join(location.split())
    try:
        # The Google answer box (class "BNeawe iBp4i AP7Wnd") carries the
        # coordinates as text containing two decimal numbers.
        text = website(f"https://www.google.com/search?q={query}+coordinates").html.find(class_="BNeawe iBp4i AP7Wnd").string
        # FIX: the original returned a generator expression, not the documented tuple.
        return tuple(float(x) for x in re.findall(r"\d+\.\d+", text))
    except Exception:
        # Best-effort: network failure, missing answer box, or unparsable text.
        return (0, 0)
def latlong(location):
    """
    PURPOSE
    This is the more sophisticated version of the location search function, as it
    uses official Singapore government services (OneMap).
    However, it of course is limited to the Singapore search area.

    PARAMETERS
    location [str]: input the location of search

    OUTPUT
    A tuple containing the lat and the long of the location searched.
    If not found, return (0,0)
    """
    url = f"https://developers.onemap.sg/commonapi/search?searchVal={'+'.join(location.split())}&returnGeom=Y&getAddrDetails=Y"
    try:
        # Take the first (best) search result only.
        coords = json.loads(requests.get(url).text)["results"][0]
        lat, long = float(coords["LATITUDE"]), float(coords["LONGITUDE"])
        return (lat, long)
    except Exception:
        # FIX: was a bare except; any failure (request error, empty results,
        # malformed JSON) is reported and mapped to the (0,0) sentinel.
        print(location, "could not be found.")
        return (0, 0)
# MRT
def getMRT():
    """
    PURPOSE (NO LONGER WORKS FULLY)
    I made this code to extract all the kinds of data you'd want to know about MRT
    and LRT stations. More specific information will be given in comments below.
    Do note that this is not the full version of the dataset, as I wanted to only
    keep the essentials in there.

    Download data for train station exits: TrainStationExit06032020.shp
    Download data for train stations: MRTLRTStnPtt.shp
    Download data for OriginDestinationTrain: origin_destination_train_202103.csv

    OUTPUT
    (mrt, mrtFull): the raw Wikipedia-scraped DataFrame and the full
    GeoDataFrame enriched with exits, passenger volumes and line colors.
    """
    path_to_TrainStationExits = ""
    path_to_TrainStations = ""
    path_to_OriginDestinationTrain = ""

    print("Extracting Wikipedia links of MRT and LRT stations.")
    # PURPOSE
    # DOES NOT WORK NOW BECAUSE WIKIPEDIA CHANGED THE FORMAT OF THE PAGE.
    # But what this section does is extract the links to all the Wikipedia pages of
    # the respective MRT and LRT stations. Changi Airport, Expo, Tanah Merah and
    # Damai LRT were added manually because they were not found anywhere within the
    # landing page.
    stations = website("https://en.wikipedia.org/wiki/List_of_Singapore_MRT_stations")
    stations.getTables()
    stationLinksM = stations.tables[1]["Links"].apply(lambda x: x.split("\n")[0]).tolist()
    # Keep only links ending in "n" (i.e. "..._station"); the "a"+x prefix guards
    # against indexing an empty string.
    stationLinksM = [x for x in stationLinksM if ("a"+x)[-1] == "n"]
    stationLinksM.extend(["https://en.wikipedia.org/wiki/Changi_Airport_MRT_station",
                          "https://en.wikipedia.org/wiki/Expo_MRT_station",
                          "https://en.wikipedia.org/wiki/Tanah_Merah_MRT_station"])
    stations = website("https://en.wikipedia.org/wiki/List_of_Singapore_LRT_stations")
    stations.getTables()
    stationLinksL = stations.tables[0]["Links"].apply(lambda x: x.split("\n")[0]).tolist()
    stationLinksL = [x for x in stationLinksL if ("a"+x)[-1] == "n"]
    stationLinksL.extend(["https://en.wikipedia.org/wiki/Damai_LRT_station_(Singapore)"])
    stationLinks = list(set(stationLinksM + stationLinksL))

    print("Extracting data from Wikipedia pages.")
    # PURPOSE
    # Each page within the links has many key information scattered in various places.
    # It was a challenge to make sure all the information from every page was
    # captured, even if the data is inconsistently placed.
    # The whole section below is to extract:
    #   Station Labels (NS26, EW14)
    #   Station Name (Raffles Place)
    #   Chinese Translation (莱佛士坊)
    #   Address (5 Raffles Place)
    #   Postal Code (048618)
    #   Latitude (Derived from 1°17′1.97″)
    #   Longitude (Derived from 103°51′5.52″)
    mrt = []
    for link in tqdm(stationLinks):
        site = website(link)
        try:
            stationName = site.html("title")[0].string.split(" MRT")[0]
        except Exception as e:
            print("stationName gave the error: " + str(e) + "\n" + link)
            stationName = None
        try:
            stationDetails = list(site.html.find(class_="fn org").stripped_strings)
            # Locate the (last) string containing CJK characters: that is the
            # Chinese translation; the labels precede it.
            chineseIndex = 0
            for detail in stationDetails:
                if re.compile('[\u4e00-\u9fff]+').search(detail):
                    chineseIndex = stationDetails.index(detail)
            # NOTE(review): if no Chinese string is found chineseIndex stays 0 and
            # [:chineseIndex-1] becomes [:-1] — presumably never hit in practice.
            stationLabels = ", ".join(stationDetails[:chineseIndex-1])
            stationChinese = stationDetails[chineseIndex]
        except Exception as e:
            print("stationLabels gave the error: " + str(e) + "\n" + link)
            stationLabels = None
            stationChinese = None
        try:
            # convertCoords (from .utils) turns degree/minute/second parts into decimals.
            lat = convertCoords(*re.findall("[0-9.]+", site.html(class_="latitude")[0].text))
            long = convertCoords(*re.findall("[0-9.]+", site.html(class_="longitude")[0].text))
        except Exception as e:
            print("latlong gave the error: " + str(e) + "\n" + link)
            lat = None
            long = None
        try:
            fullAddress = list(site.html(class_="infobox-data")[0].strings)
            address = fullAddress[0]
            # Singapore postal codes are exactly six digits.
            postcode = re.findall(r"\d{6}", fullAddress[1])[0]
        except Exception as e:
            print("address gave the error: " + str(e) + "\n" + link)
            address = None
            postcode = None
        mrt.append({"Label": stationLabels, "Name": stationName, "Chinese": stationChinese, "Address": address, "Postcode": postcode, "Lat": lat, "Long": long, "Link": link})

    print("Mapping station names to their abbreviations.")
    # PURPOSE
    # The abbreviations to the MRT and LRT stations are captured in the site listed
    # below. So it was a matter of extracting them and mapping them to the existing
    # data.
    site = website("https://en.wikipedia.org/wiki/List_of_Singapore_MRT_stations")
    abbrMapping = {}
    for t in site.html("td", text=re.compile("^[A-Z]{3}$")):
        if t.text != "TBA":
            abbrMapping[t.find_previous("a", href=True).text.strip()] = t.text
    def tryAbb(x):
        try: return abbrMapping[x]
        except: return ""
    # FIX: the DataFrame must be built BEFORE the column assignment — the original
    # assigned mrt["Abbreviation"] while mrt was still a list of dicts.
    mrt = pd.DataFrame(mrt)
    mrt["Abbreviation"] = mrt.Name.apply(tryAbb)

    print("Finding coordinates of exits of MRT and LRT stations.")
    # PURPOSE
    # As of this point, the information gathered is more than enough for my purposes.
    # However, just to finish things off, the following steps take things a whole
    # step further. With the help of a couple of datasets from LTA Data Mall, we can
    # now know:
    #   The locations of the exits to the existing MRT and LRT stations.
    #   The number of passengers to and from stations.
    # I also did a mapping from the labels of the stations (NS26, EW14) into their
    # respective colors.
    exits = gpd.read_file(path_to_TrainStationExits)
    # FIX: these patterns use regex metacharacters (the dot in " .RT"); modern
    # pandas defaults str.replace(regex=False), so regex=True is now explicit.
    exits["STN_NAME"] = exits["STN_NAME"].str.replace(" .RT STATION", "", regex=True).str.replace(" STATION", "", regex=True)
    exits = exits[["STN_NAME", "EXIT_CODE", "geometry"]]
    exits.columns = ["Name", "Exit", "geometry"]
    stations = gpd.read_file(path_to_TrainStations)
    stations["geometry"] = stations.geometry.to_crs(epsg=4326)
    stations["STN_NAME"] = stations["STN_NAME"].str.replace(" .RT STATION", "", regex=True).str.replace(" STATION", "", regex=True)
    stations = stations[["STN_NAME", "STN_NO", "geometry"]]
    stations.columns = ["Name", "Exit", "geometry"]
    mrtFull = mrt.copy()
    mrtFull["Name"] = mrtFull.Name.str.upper().str.replace(" .RT STATION.*$", "", regex=True)
    # Stack station-point rows and exit-point rows for the same station names.
    mrtFull = pd.concat([mrtFull.merge(stations, how="left"),
                         mrtFull.merge(exits, how="left")])
    mrtFull["Exit"] = mrtFull[["Exit", "Label"]].apply(lambda x: x[1] if pd.isna(x[0]) else x[0], axis=1)
    mrtFull = gpd.GeoDataFrame(mrtFull, geometry="geometry")
    mrtFull["Labels"] = mrtFull.Label.copy()
    mrtFull["Label"] = mrtFull["Exit"]
    mrtFull = mrtFull.drop("Exit", axis=1)
    mrtFull = mrtFull[["Label", "Name", "Chinese", "Address", "Postcode", "Lat", "Long", "Labels", "geometry", "Link"]]
    mrtFull["Name"] = mrtFull.Name.str.replace(" LRT.*?$", "", regex=True)
    # Prefer the shapefile geometry for coordinates; fall back to Wikipedia values.
    mrtFull["Long"] = mrtFull[["geometry", "Long"]].apply(lambda x: x[1] if pd.isna(x[0]) else x[0].x, axis=1)
    mrtFull["Lat"] = mrtFull[["geometry", "Lat"]].apply(lambda x: x[1] if pd.isna(x[0]) else x[0].y, axis=1)
    # Interchange stations carry several labels ("NS26, EW14"); one row per label.
    mrtFull["Label"] = mrtFull.Label.str.split("[,/]")
    mrtFull = gpd.GeoDataFrame(pd.DataFrame(mrtFull).explode("Label"))
    mrtFull["Label"] = mrtFull.Label.str.strip()
    mrtFull["line"] = mrtFull.Label.str.extract(r"(\D+)").fillna(0)
    mrtFull = mrtFull[mrtFull.line != 0]
    passengers = pd.read_csv(path_to_OriginDestinationTrain)
    passengers.ORIGIN_PT_CODE = passengers.ORIGIN_PT_CODE.str.split("/")
    passengers.DESTINATION_PT_CODE = passengers.DESTINATION_PT_CODE.str.split("/")
    passengers = passengers.explode("ORIGIN_PT_CODE")
    passengers = passengers.explode("DESTINATION_PT_CODE")
    passengers.ORIGIN_PT_CODE = passengers.ORIGIN_PT_CODE.str.strip()
    passengers.DESTINATION_PT_CODE = passengers.DESTINATION_PT_CODE.str.strip()
    mrtFull = mrtFull.merge(pd.DataFrame(passengers.groupby("ORIGIN_PT_CODE").pipe(lambda x: x.TOTAL_TRIPS.sum())),
                            left_on="Label", right_on="ORIGIN_PT_CODE", how="left")
    mrtFull = mrtFull.merge(pd.DataFrame(passengers.groupby("DESTINATION_PT_CODE").pipe(lambda x: x.TOTAL_TRIPS.sum())),
                            left_on="Label", right_on="DESTINATION_PT_CODE", how="left")
    mrtFull = mrtFull.rename(columns={"TOTAL_TRIPS_x": "Origin", "TOTAL_TRIPS_y": "Destination"})
    # Official line colors, keyed by the two-letter line prefix of the label.
    colorMap = {"EW": "#009645",
                "CG": "#009645",
                "NS": "#D42E12",
                "NE": "#9900AA",
                "CC": "#FA9E0D",
                "CE": "#FA9E0D",
                "DT": "#005EC4",
                "TE": "#9D5B25",
                "JE": "#0099AA",
                "JS": "#0099AA",
                "JW": "#0099AA",
                "CR": "#97C616",
                "CP": "#97C616",
                "PE": "#748477",
                "PW": "#748477",
                "PT": "#748477",
                "ST": "#748477",
                "SE": "#748477",
                "SW": "#748477",
                "BP": "#748477",
                "RT": "#86CEEB"}
    mrtFull["color"] = mrtFull.Label.apply(lambda x: colorMap[x[:2]] if x[:2] in colorMap.keys() else None)
    mrtFull["number"] = mrtFull.Label.str.extract(r"(\d+)").fillna("1000")
    mrtFull["number"] = mrtFull.number.apply(int)
    # Station labels have a 2+ letter line prefix; exit labels (e.g. "A") do not.
    mrtFull["Type"] = mrtFull.line.apply(lambda x: "Station" if len(x) > 1 else "Exit")
    mrtFull = mrtFull[['Label', 'Name', 'Chinese', 'Address', 'Postcode', 'Lat', 'Long',
                       'Labels', 'Type', 'Origin', 'Destination', 'geometry', 'color', 'line', 'number', 'Link']]
    return (mrt, mrtFull)
# BUS
def getBus(accountKey):
    """
    PURPOSE
    I made this code to extract all the kinds of data you'd want to know about bus
    services and stations:
        Service No, Operator, BusStopCode, Distance to next stop, Road Name,
        Description, Latitude, Longitude, Altitude, Passenger Inflow,
        Passenger Outflow

    Download data for OriginDestinationBus: origin_destination_bus_202103.csv

    PARAMETERS
    accountKey [str]: LTA DataMall API account key.

    OUTPUT
    DataFrame of bus routes joined with stop details and tap-in/tap-out volumes.
    """
    path_to_OriginDestinationBus = ""

    def fetchAll(endpoint):
        # LTA DataMall pages results 500 at a time via $skip; loop until an
        # empty page (or a failure) and return whatever was collected.
        records = []
        starting = 0
        while True:
            try:
                headers = {'AccountKey': accountKey}
                results = requests.get(endpoint + '?$skip=' + str(starting), headers=headers).text
                results = json.loads(results)
                if len(results['value']) == 0:
                    break
                records.extend(results['value'])
                starting += 500
            except Exception:  # FIX: was bare except; best-effort stop on failure
                break
        return records

    # The two paginated pulls shared identical logic; factored into fetchAll above.
    busRoutes = fetchAll('http://datamall2.mytransport.sg/ltaodataservice/BusRoutes')
    busStops = fetchAll('http://datamall2.mytransport.sg/ltaodataservice/BusStops')
    bus = pd.DataFrame(busRoutes).merge(pd.DataFrame(busStops))
    passengers = pd.read_csv(path_to_OriginDestinationBus)
    tapIn = passengers.groupby("PT_CODE").pipe(lambda x: x.TOTAL_TAP_IN_VOLUME.sum())
    tapOut = passengers.groupby("PT_CODE").pipe(lambda x: x.TOTAL_TAP_OUT_VOLUME.sum())
    passengers = pd.DataFrame([tapIn, tapOut]).T
    passengers = passengers.reset_index()
    passengers.columns = ["BusStopCode", "In", "Out"]
    return bus.merge(passengers)
# ROADS
def getRoads():
    """
    PURPOSE
    I made this code to combine road data with planning area data.
    It is used to find roads within any particular planning area.

    Download data for road network: road-network.kml
    Download data for planning areas: Subzone_Census2010.kml

    OUTPUT
    GeoDataFrame of roads, one row per (road, planning area) pairing; roads
    outside every planning area get empty-string area fields.
    """
    path_to_RoadNetwork = ""
    path_to_PlanningAreas = ""
    # Enable the KML driver, which fiona does not support by default.
    gpd.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw'
    roads = gpd.read_file(path_to_RoadNetwork, driver='KML')
    # KML descriptions are HTML tables; pull fields out with regexes.
    roads["Name"] = roads.Description.str.extract("<td>(.*?)</td>")
    roads["Type"] = roads.Description.str.extract("RD_TYP_CD</th>.*?<td>(.*?)</td>")
    roads = roads.drop("Description", axis=1)
    planning = gpd.read_file(path_to_PlanningAreas, driver='KML')
    planning["SubzoneCode"] = planning.Description.str.extract("Subzone Code.*?<td>(.*?)</td>")
    planning["Planning"] = planning.Description.str.extract("Planning Area Name.*?<td>(.*?)</td>")
    planning["PlanningCode"] = planning.Description.str.extract("Planning Area Code.*?<td>(.*?)</td>")
    planning["Region"] = planning.Description.str.extract("Region Name.*?<td>(.*?)</td>")
    planning["RegionCode"] = planning.Description.str.extract("Region Code.*?<td>(.*?)</td>")
    planning = planning.rename(columns={"Name": "Subzone"})
    planning = planning.drop("Description", axis=1)
    planning = planning[['Region', 'RegionCode', 'Planning', 'PlanningCode', 'Subzone', 'SubzoneCode', 'geometry']]
    # Cross-check every road against every planning area; a road spanning several
    # areas appears once per area it touches.
    roadsm = roads.values.tolist()
    planningm = planning.values.tolist()
    roads = []
    for road, line, rType in tqdm(roadsm):
        appeared = False
        for *details, shape in planningm:
            if line.within(shape) or line.intersects(shape):
                roads.append((road, rType, line) + tuple(details))
                appeared = True
        if not appeared:
            # Keep the road anyway, with blank planning-area fields.
            roads.append((road, rType, line) + ("",)*6)
    roads = gpd.GeoDataFrame(roads)
    roads.columns = ["Road", "Type", "geometry", "Region", "RegionCode", "Planning", "PlanningCode", "Subzone", "SubzoneCode"]
    # PURPOSE
    # Visualisation of roads in Singapore.
    ax = roads[roads.Region != ""].plot(figsize=(15, 15), column="Subzone")
    ctx.add_basemap(ax=ax, zoom=13, crs="EPSG:4326")
    # FIX: the original built the GeoDataFrame but never returned it.
    return roads
# ALTITUDE
def generateElevationMap():
    """
    PURPOSE
    To estimate the altitude of a particular location, you need to preprocess the
    contour lines first. That's what this code is for.
    Be sure to save the output of this code as a variable, as it will be used for
    the actual altitude code (alt()).

    Download national_map_line data: national-map-line-geojson.geojson

    OUTPUT
    GeoDataFrame with one row per contour level: columns "alt" (int metres)
    and "geometry" (MultiPolygon of all contours at that level).
    """
    path_to_NationalMapLine = ""
    elevation = gpd.read_file(path_to_NationalMapLine)
    elevation["Location"] = elevation.Description.str.extract("<td>(.*?)</td>")
    # Keep only rows whose extracted label is purely numeric (a contour height).
    elevation = elevation[elevation.Location.str.contains(r"^\d+$")]
    L = []
    for geometry in elevation.geometry.tolist():
        try:
            # Re-parse the coordinate pairs out of the WKT text and close each
            # contour line into a Polygon.
            L.append(shapely.geometry.Polygon([(float(i[0]), float(i[1])) for i in re.findall(r"(\d+\.\d+)\s(\d+\.\d+)", str(geometry))]))
        except Exception:
            # FIX: was bare except; degenerate rings (<3 points) are marked "" and
            # dropped below.
            L.append("")
    elevation["polygon"] = L
    elevation = elevation[elevation.polygon != ""]
    # Merge all polygons of the same height into one MultiPolygon per level.
    multipolygons = []
    for location in elevation.Location.unique():
        multipolygons.append({"alt": int(location), "geometry": shapely.geometry.MultiPolygon(elevation[elevation.Location == location].polygon.tolist())})
    return gpd.GeoDataFrame(multipolygons)
def alt(coords, elevationMap):
    """
    PURPOSE
    To find the altitude (or elevation) of a particular point in Singapore.
    The way this code works is just finding the distance between the point and the
    two closest contour lines to it. Then, by some formula (can see below), an
    estimation is made.

    PARAMETERS
    coords [tuple of longitude and latitude]
    elevationMap: obtained when running the previous code, generateElevationMap()
                  (columns "alt" and "geometry")
    """
    # FIX: shapely distance/containment need a geometry, not a raw tuple.
    point = shapely.geometry.Point(coords)
    # 1000 degrees is a sentinel far larger than any real distance in Singapore.
    closestDistances = {x: 1000 for x in elevationMap.alt.unique()}
    maxalt = 0
    # FIX: generateElevationMap() returns columns "geometry"/"alt"; the original
    # referenced "polygon"/"Location", which do not exist on that frame.
    for polygon, level in elevationMap[["geometry", "alt"]].values.tolist():
        # .boundary works for Polygon and MultiPolygon alike (MultiPolygon has no
        # .exterior attribute, which the original relied on).
        distance = polygon.boundary.distance(point)
        closestDistances[int(level)] = min(closestDistances[int(level)], distance)
        # Track the highest contour the point lies inside.
        if point.within(polygon) and maxalt < int(level):
            maxalt = int(level)
    # Contour lines are assumed to be at 20 m intervals — TODO confirm against data.
    upper = maxalt + 20
    if upper not in closestDistances:
        # FIX: the point sits inside the highest contour; the original KeyError'd.
        return float(maxalt)
    # Linear interpolation between the two bracketing contour levels, weighted by
    # the distance to each.
    return (upper * closestDistances[maxalt] + maxalt * closestDistances[upper]) / (closestDistances[maxalt] + closestDistances[upper])
# PARKING
def getParkingLots(accountKey):
    """
    PURPOSE
    Gets the number of parking lots for particular locations.

    PARAMETERS
    accountKey [str]: LTA DataMall API account key.

    OUTPUT
    DataFrame of car park availability records.
    """
    parking = []
    starting = 0
    # LTA DataMall pages results 500 at a time via $skip; loop until an empty page.
    while True:
        try:
            headers = {'AccountKey': accountKey}
            results = requests.get('http://datamall2.mytransport.sg/ltaodataservice/CarParkAvailabilityv2?$skip=' + str(starting), headers=headers).text
            results = json.loads(results)
            if len(results['value']) == 0:
                break
            parking.extend(results['value'])
            starting += 500
        except Exception:  # FIX: was bare except; best-effort stop on any failure
            break
    return pd.DataFrame(parking)