Additional explanation of your data and data collection process;
Outline your analytical procedure:
You can organize them into subsections and in each subsection, please document the major analytical results. Please also provide brief discussions if any.
Group project: data modeling and prediction; result mode (regression / classification); meaning of the results; linear regression;
dataset; predictive modeling results; compare the results for the peak season.
Dataset download: https://www.kaggle.com/usdot/flight-delays#flights.csv
Data Cleansing
Load data from airports.csv, flights.csv, airlines.csv by using pandas
# Notebook setup: third-party imports plus plotting/display configuration.
# Statement order follows the original cell.
import pandas as pd
import numpy as np
import seaborn as sns
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly import tools
import matplotlib.pyplot as plt

sns.set_style("whitegrid")

# Equivalent of the `%matplotlib inline` cell magic, written through the
# IPython API so that this file still parses as plain Python.
# get_ipython() returns None when no IPython shell is running.
from IPython import get_ipython

_ipython_shell = get_ipython()
if _ipython_shell is not None:
    _ipython_shell.run_line_magic("matplotlib", "inline")

import datetime
import warnings
import scipy
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.patches import ConnectionPatch
from collections import OrderedDict
from matplotlib.gridspec import GridSpec
from mpl_toolkits.basemap import Basemap
from scipy.optimize import curve_fit

# Plot styling.
plt.rcParams["patch.force_edgecolor"] = True
plt.style.use("fivethirtyeight")
mpl.rc("patch", edgecolor="dimgray", linewidth=1)

# Notebook display behaviour.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50
warnings.filterwarnings("ignore")
# Load tables
airlines = pd.read_csv("../airlines.csv")
airports = pd.read_csv("../airports.csv")
# Only the first 100,000 flights are used. `nrows` selects the same rows as
# reading the whole file and calling `.head(100000)`, without parsing the rest.
flights = pd.read_csv(
    "../flights.csv",
    low_memory=False,
    encoding="utf-8",
    nrows=100000,
)
print(flights.shape)
flights.DEPARTURE_TIME.head(10)

# Assemble one datetime column from the separate YEAR / MONTH / DAY columns.
flights["DATE"] = pd.to_datetime(flights[["YEAR", "MONTH", "DAY"]])
def get_datetime_from_string(time):
    """Convert an HHMM-encoded clock value into a ``datetime.time``.

    Args:
        time: Numeric clock value such as ``5`` (00:05), ``1430`` (14:30)
            or ``2400``; may be null/NaN.

    Returns:
        ``np.nan`` when ``time`` is null, otherwise a ``datetime.time``.
        ``2400`` is mapped to ``00:00``.
    """
    if pd.isnull(time):
        return np.nan
    if time == 2400:
        time = 0
    # Zero-pad to four digits so the hour and minute can be sliced out.
    hhmm = "{0:04d}".format(int(time))
    # NOTE: the source also carried a strptime(...).time().strftime(...) line
    # at this point. strptime cannot accept a datetime.time, and a string
    # result would be rejected by datetime.datetime.combine in get_datetime,
    # so that line is treated as dead code and left out.
    return datetime.time(int(hhmm[0:2]), int(hhmm[2:4]))


def get_datetime(x):
    """Combine a ``(date, time)`` pair into a single ``datetime.datetime``.

    Args:
        x: Two-element sequence (tuple, list or Series) holding a date-like
            value followed by a ``datetime.time``.

    Returns:
        ``np.nan`` when either element is null, otherwise the combined
        ``datetime.datetime``.
    """
    # Unpacking works for tuples, lists and Series alike, and avoids the
    # positional Series[int] lookup that pandas has deprecated.
    date_part, time_part = x
    if pd.isnull(date_part) or pd.isnull(time_part):
        return np.nan
    return datetime.datetime.combine(date_part, time_part)


def get_flight_time(flights, col):
    """Build full timestamps from ``flights['DATE']`` and an HHMM column.

    Args:
        flights: DataFrame with a ``DATE`` column and the column named ``col``.
        col: Name of the column holding HHMM-encoded clock values.

    Returns:
        A ``pd.Series`` (default integer index) with one entry per row: the
        combined date and time, or a null value where the clock value is
        missing. A clock value of 2400 becomes 00:00 on the following day.
    """
    combined = []
    for flight_date, raw_time in zip(flights["DATE"], flights[col]):
        if pd.isnull(raw_time):
            combined.append(np.nan)
        elif float(raw_time) == 2400:
            # 24:00 is midnight at the start of the next calendar day.
            next_day = flight_date + datetime.timedelta(days=1)
            combined.append(get_datetime((next_day, datetime.time(0, 0))))
        else:
            clock = get_datetime_from_string(raw_time)
            combined.append(get_datetime((flight_date, clock)))
    return pd.Series(combined)
# Replace the HHMM-encoded schedule columns with full timestamps.
flights["SCHEDULED_DEPARTURE"] = get_flight_time(flights, "SCHEDULED_DEPARTURE")
flights["SCHEDULED_ARRIVAL"] = get_flight_time(flights, "SCHEDULED_ARRIVAL")

# Both tables have an AIRLINE column, so the merge suffixes them:
# AIRLINE_x comes from flights, AIRLINE_y from airlines.
delays = pd.merge(
    flights,
    airlines,
    left_on="AIRLINE",
    right_on="IATA_CODE",
    how="left",
)

# Remove unneeded variables; adjust this list depending on your needs.
variables_to_remove = [
    "TAXI_OUT", "TAXI_IN", "WHEELS_ON", "WHEELS_OFF", "YEAR",
    "MONTH", "DAY", "DAY_OF_WEEK", "DATE", "AIR_SYSTEM_DELAY",
    "SECURITY_DELAY", "AIRLINE_DELAY", "LATE_AIRCRAFT_DELAY",
    "WEATHER_DELAY", "DIVERTED", "CANCELLED", "CANCELLATION_REASON",
    "FLIGHT_NUMBER", "TAIL_NUMBER", "AIR_TIME",
]
delays.drop(variables_to_remove, axis=1, inplace=True)

# Keep only the columns used for the analysis, in this order.
delays = delays[[
    "AIRLINE_y", "ORIGIN_AIRPORT", "DESTINATION_AIRPORT",
    "SCHEDULED_DEPARTURE", "DEPARTURE_TIME", "DEPARTURE_DELAY",
    "SCHEDULED_ARRIVAL", "ARRIVAL_TIME", "ARRIVAL_DELAY",
    "SCHEDULED_TIME", "ELAPSED_TIME",
]]
delays[:5]
delays = delays.rename(columns={"AIRLINE_y": "AIRLINE"})
Data Description:
IATA_CODE object
AIRLINE object
dtype: object
(14, 2)
IATA_CODE object
AIRPORT object
CITY object
STATE object
COUNTRY object
LATITUDE float64
LONGITUDE float64
dtype: object
(322, 7)
YEAR int64
MONTH int64
DAY int64
DAY_OF_WEEK int64
AIRLINE object
FLIGHT_NUMBER int64
TAIL_NUMBER object
ORIGIN_AIRPORT object
DESTINATION_AIRPORT object
SCHEDULED_DEPARTURE int64
DEPARTURE_TIME float64
DEPARTURE_DELAY float64
TAXI_OUT float64
WHEELS_OFF float64
SCHEDULED_TIME float64
ELAPSED_TIME float64
AIR_TIME float64
DISTANCE int64
WHEELS_ON float64
TAXI_IN float64
SCHEDULED_ARRIVAL int64
ARRIVAL_TIME float64
ARRIVAL_DELAY float64
DIVERTED int64
CANCELLED int64
CANCELLATION_REASON object
AIR_SYSTEM_DELAY float64
SECURITY_DELAY float64
AIRLINE_DELAY float64
LATE_AIRCRAFT_DELAY float64
WEATHER_DELAY float64
dtype: object
(5819079, 31)