Appendix B - YAML Files

Table of contents

  1. Main Configuration
  2. Calculations Configuration
    1. Link Measures - Corridor Explorer
    2. Stop Measures - Corridor Explorer
    3. Link Measures - BCAP
  3. Data Schemas
    1. Raw Transactions Schema
    2. Transaction Schema
    3. Raw Itinerary

Main Configuration

folder_locations:
  src_path: 'C:/projects/transit_corridor_analytics/3_etl/src'
  raw_folder: "../../1_raw_data/"
  input_folder: "../../2_input_data/"
  schema_folder: "../assets/schema_files"
  config_folder: "../assets/config_files"
  output_folder: "../../4_outputs/"

data_items:
  raw:
    - name: 'itinerary'
      relative_path: '1_hastus/{version}/1_itineraries/{region}_{version}_GTFS_Itineraries.csv'
      schema_file_name: 'hastus_raw_itinerary_schema_{schema_version}.yaml'
      column_names: [ 'shape_id','segment','order','seg_direction','seg_id','segment_order' ]
      processed_folder: '3_hastus/{version}/1_itineraries'
      processed_file_name: '{region}_{version}_GTFS_Itineraries.csv'
      default_variables:
        region: 'SEQ'
        version: '20221014'
        schema_version: '2023.0.0' # schema version used when the raw file has no column header

    - name: 'full_network'
      relative_path: '1_hastus/{version}/2_streetsegment_network'
      processed_folder: '3_hastus/{version}/2_streetsegment_network'
      processed_file_name: 'hastus_network.parquet.gz'
      default_variables:
        version: '202310'

    - name: 'stop_location'
      relative_path: '1_hastus/{version}/3_stops/stop_location.txt'
      processed_folder: '3_hastus/{version}/3_stops'
      processed_file_name: 'stop_location.csv'
      default_variables:
        version: '20221014'

    - name: 'transactions'
      relative_path: '3_ticketing/{version}/transactions_daily_{version}.csv'
      schema_file_name: 'raw_transactions_schema_{schema_version}.yaml'
      partition_by: 'period_date'
      processed_folder: '5_transactions/partitioned'  # When the raw transaction file is processed, the partitioned data is saved under this location. This path is relative to the input folder defined in the folder structure.
      processed_file_name: 'transaction.csv'
      default_variables:
        version: '202310'
        schema_version: '2024.0.0'

    - name: 'trip_stop_timing'
      relative_path: '3_ticketing/{version}/trip_stop_timing_daily_{version}.csv'
      schema_file_name: 'trip_stop_timing_schema_{schema_version}.yaml'
      partition_by: 'period_date'
      processed_folder: '6_trip_stop_timing/partitioned'
      processed_file_name: 'tripstoptiming.csv'
      default_variables:
        version: '202310'
        schema_version: '2024.0.0'

  inputs:

    - name: "ref_transaction_ticket_status"
      relative_path: '1_reference_tables/ref_{version}/ref_ticket_status.csv'
      schema_file_name: "ref_ticket_status_schema_{schema_version}.yaml"
      default_variables:
        version: '2024.0.1'
        schema_version: '2021.0.0'

    - name: "ref_transaction_to_gtfs"
      storage_type: local_drive
      file_type: csv
      is_partitioned: false
      relative_path: '1_reference_tables/ref_{version}/ref_transaction_to_gtfs.csv'
      schema_file_name: "ref_transaction_to_gtfs_schema_{schema_version}.yaml"
      default_variables:
        version: '2024.0.1'
        schema_version: '2021.0.0'

    - name: "ref_trip_stop_timing_to_gtfs"
      storage_type: local_drive
      file_type: csv
      is_partitioned: false
      relative_path: '1_reference_tables/ref_{version}/ref_trip_stop_timing_to_gtfs_{region}.csv'
      schema_file_name: 'ref_trip_stop_timing_to_gtfs_schema_{schema_version}.yaml'
      default_variables:
        version: '2024.0.1'
        schema_version: '2023.0.0'

    - name: "corridor_definition"
      storage_type: local_drive
      file_type: csv
      relative_path: "2_corridor_definition/corridor_definition_{version}/seq_corridors_segments.csv"
      schema_file_name: 'corridor_definition_schema_{schema_version}.yaml'
      default_variables:
        version: "2023.0.0"
        schema_version: '2024.0.0'

    - name: "transactions"
      is_partitioned: true
      relative_path: "5_transactions/partitioned/Year={year}/Month={month}/Day={day}/transaction.parquet.gz"
      schema_file_name: "transactions_schema_{schema_version}.yaml"
      default_variables:
        schema_version: '2023.0.0'

    - name: "trip_stop_timing"
      storage_type: local_drive
      file_type: csv
      is_partitioned: true
      relative_path: "6_trip_stop_timing/partitioned/Year={year}/Month={month}/Day={day}/tripstoptiming.parquet.gz"
      schema_file_name: "trip_stop_timing_schema_{schema_version}.yaml"
      default_variables:
        schema_version: '2023.0.0'

    - name: "full_network"
      storage_type: "local_drive"
      file_type: "parquet"
      requires_region: false
      relative_path: "3_hastus/{version}/2_streetsegment_network/hastus_network.parquet.gz"
      layer_name: 'processed_streetsegment'
      default_variables:
        schema_version: '2023.0.0'

    - name: "itinerary"
      storage_type: "local_drive"
      file_type: "csv"
      requires_region: true
      relative_path: "3_hastus/{version}/1_itineraries/{region}_{version}_GTFS_Itineraries.csv"
      schema_file_name: "hastus_raw_itinerary_schema_{schema_version}.yaml"
      default_variables:
        region: "SEQ"
        version: "20230307"
        schema_version: "2023.0.0"

    - name: "spatial_itinerary"
      storage_type: "local_drive"
      file_type: "parquet"
      requires_region: true
      relative_path: "3_hastus/{version}/4_spatial_itinerary/{region}_{version}_GTFS_Itineraries.parquet.gz"
      schema_file_name: "hastus_spatial_itinerary_schema_{schema_version}.yaml"
      default_variables:
        region: "SEQ"
        version: "20230307"
        schema_version: "2024.0.0"

    - name: "stop_location"
      storage_type: "local_drive"
      file_type: "csv"
      requires_region: true
      relative_path: "3_hastus/{version}/3_stops/stop_location.csv"
      default_variables:
        region: "SEQ"
        version: "20230307"
        schema_version: "2024.0.0"


    - name: "gtfs_feed"
      requires_region: true # Will need a region when we include all the regions in the input folder.
      relative_path: "4_gtfs/2_cubic/{version}/{region}_GTFS.zip"
      default_variables:
        region: "SEQ"
        version: "20230307"

    - name: "feedback"
      relative_path: "7_other_spatial_inputs/bus_operator_feedback.parquet.gz"

    - name: "centers"
      relative_path: "7_other_spatial_inputs/regional_activity_centres.parquet.gz"

    - name: "lga"
      relative_path: "7_other_spatial_inputs/local_government_areas.parquet.gz"

    - name: "roads"
      relative_path: "7_other_spatial_inputs/baseline_roads.parquet.gz"

    - name: "suburbs"
      relative_path: "7_other_spatial_inputs/suburbs.parquet.gz"

    - name: "pt_needs"
      relative_path: "7_other_spatial_inputs/pt_needs_index.parquet.gz"

    - name: "strat_network"
      relative_path: "7_other_spatial_inputs/corridors_shapes.parquet.gz"

    - name: "busway"
      relative_path: "7_other_spatial_inputs/busway_shapes_v3.parquet.gz"

    - name: "cbd"
      relative_path: "7_other_spatial_inputs/brisbane_city.parquet.gz"


  outputs:
    - name: "stop_to_stop_measure"
      storage_type: "local_drive"
      file_type: "parquet"
      requires_region: true
      relative_path: "1_stop_to_stop_outputs/{version}/region={region}/Year={year}/Month={month}/Day={day}/stop_to_stop_measures.parquet.gz"
      is_partitioned: true
      schema_file_name: 'stop_to_stop_measures_schema_{schema_version}.yaml'
      default_variables:
        version: "20221014"
        region: 'SEQ'
        schema_version: '2024.0.0'

    - name: "link_based_measures"
      storage_type: "local_drive"
      file_type: "parquet"
      relative_path: "2_travel_time_split/{version}/region={region}/Year={year}/Month={month}/Day={day}/link_based_measures.parquet.gz"
      requires_region: true
      schema_file_name: "link_based_measures_schema_{schema_version}.yaml"
      is_partitioned: true
      default_variables:
        version: "20221014"
        region: 'SEQ'
        schema_version: '2024.0.0'

    - name: "appended_corridors"
      storage_type: "local_drive"
      file_type: "parquet"
      relative_path: "3_corridor_explorer/1_append_corridors/{version}/corridor_type={corridor_type}/region={region}/{corridor_id}_tt_estimated.parquet.gz"
      requires_region: true
      default_variables:
        version: "20221014"
        region: 'SEQ'
        schema_version: '2024.0.0'

    - name: "corridors_estimated_link_based_measures"
      storage_type: "local_drive"
      file_type: "parquet"
      relative_path: "3_corridor_explorer/2_measures/{version}/corridor_type={corridor_type}/region={region}/link_based_measures/{corridor_id}_link_based_estimates.parquet.gz"
      requires_region: true
      default_variables:
        version: "20221014"
        region: 'SEQ'
        schema_version: '2024.0.0'

    - name: "corridors_estimated_stop_based_measures"
      storage_type: "local_drive"
      file_type: "parquet"
      relative_path: "3_corridor_explorer/2_measures/{version}/corridor_type={corridor_type}/region={region}/stop_based_measures/{corridor_id}_stop_based_estimates.parquet.gz"
      requires_region: true
      default_variables:
        version: "20221014"
        region: 'SEQ'
        schema_version: '2024.0.0'

    - name: "corridors_bus_bunching_measures"
      storage_type: "local_drive"
      file_type: "parquet"
      relative_path: "3_corridor_explorer/2_measures/{version}/corridor_type={corridor_type}/region={region}/bus_bunching_measures/{corridor_id}_bus_bunching_measures.parquet.gz"
      requires_region: true
      default_variables:
        version: "20221014"
        region: 'SEQ'
        schema_version: '2024.0.0'

    - name: "bcap_segments_measures"
      storage_type: "local_drive"
      file_type: "parquet"
      relative_path: "4_bcap/1_measures/{version}/region={region}/segments_measures.parquet.gz"
      requires_region: true
      default_variables:
        version: "20221014"
        region: 'SEQ'
        schema_version: '2024.0.0'

    - name: "bcap_stops_measures"
      storage_type: "local_drive"
      file_type: "parquet"
      relative_path: "4_bcap/1_measures/{version}/region={region}/stops_measures.parquet.gz"
      requires_region: true
      default_variables:
        version: "20221014"
        region: 'SEQ'
        schema_version: '2024.0.0'


connections:
  - connection_name: "local_postgres"
    host: "localhost"
    port: 5432
    dbname: "sampledb"
    user: "user"
    password: "password"



Calculations Configuration

Link Measures - Corridor Explorer

# This YAML configuration file is used to define operations for calculating aggregate measures from
# the link-based measures dataframe that is already appended to the corridor definitions.


operations:
  - type: calculation
    name: extracting hour
    formula: "hour = scheduled_start.dt.hour"

  - type: custom_transform
    name: dropping null estimated travel time
    formula: "df = df[df['link_actual_travel_time'].notnull()]"

  - type: custom_transform
    name: dropping any possible case of zero travel time
    formula: "df = df[df['link_actual_travel_time']>0]"

  - type: custom_transform
    name: creating service column
    formula: "df.loc[:, 'service'] = df['trip_id'] + '-' + df['scheduled_start'].astype(str)"

  - type: calculation
    name: estimating segment speed (km/hr)
    formula: "actual_speed = seg_length* 3.6/ link_actual_travel_time"
    
  - type: custom_transform
    name: remove outlier speed
    formula: "df = df[(df['actual_speed']<100) & (df['actual_speed']>=5)]"


  - type: aggregation
    name: calculating free flow travel time
    group_by: [ "region",  "hour", "corridor_type",   "corridor_desc", "corridor_id","corridor_direction","corridor_section", "corridor_seg_order", "seg_id", "seg_direction" ]
    aggregations:
      free_flow_travel_time:
        column: link_actual_travel_time
        function: "quantile"
        params:
          q: 0.15
    transform: true

  - type: custom_transform
    name: calculating timetable delay per link (in seconds)
    formula: "calculate_timetable_delay(df)"

  - type: custom_transform
    name: calculating estimated delay per link (in seconds)
    formula: "calculate_link_estimated_delay(df)"

  - type: calculation
    name: timetable delay per km (min per km)
    formula: "timetable_delay_per_km = (link_timetable_delay/60) / (seg_length/1000)"

  - type: calculation
    name: estimated delay per km (min/km)
    formula: "link_estimated_delay_per_km = (link_estimated_delay/60) / (seg_length/1000)"

  - type: aggregation
    name: estimating aggregate measures
    group_by: [ "region", "hour", "corridor_type", "corridor_desc",  "corridor_id","corridor_direction",
                "corridor_section", "corridor_seg_order",  "seg_id", "seg_direction" ]
    aggregations:
      seg_length:
        column: "seg_length"
        function: "first"
      count_date:
        column: "date"
        function: "nunique"
      travel_time_link_avg_hourly:
        column: "link_actual_travel_time"
        function: "non_zero_mean"
      travel_time_link_Median_hourly:
        column: "link_actual_travel_time"
        function: "non_zero_median"
      StdDevNo0_estimated_travel_time:
        column: "link_actual_travel_time"
        function: "non_zero_std"
      CountNonNull_estimated_travel_time:
        column: "link_actual_travel_time"
        function: "non_null_count"
      total_stop_load_per_link_hourly:
        column: "stop_load"
        function: "sum"
      count_services_link_hour:
        column: "service"
        function: "nunique"
      std_stop_load_per_service_hourly:
        column: "stop_load"
        function: "non_zero_std"
      timetable_delay_per_km:
        column: "timetable_delay_per_km"
        function: "median"
      link_estimated_delay_per_km:
        column: "link_estimated_delay_per_km"
        function: "median"


  - type: aggregation
    name: estimating corridor length
    group_by: [ "region", "hour", "corridor_type", "corridor_id", "corridor_direction" ]
    aggregations:
      corridor_length:
        column: seg_length
        function: "sum"
    transform: true

  - type: calculation
    name: convert corridor length to km
    formula: "corridor_length = corridor_length/1000"

  - type: aggregation
    name: estimating corridor length threshold
    group_by: [ "region", "hour", "corridor_type", "corridor_id",  "corridor_direction" ]
    aggregations:
      corridor_length_threshold:
        column: corridor_length
        function: "mode"
    transform: true

  - type: custom_transform
    name: assigning accept_hour
    formula: "assign_accept_hour(df)"

  - type: calculation
    name: estimating segment speed (km/hr)
    formula: "seg_speed = seg_length* 3.6/ travel_time_link_avg_hourly"

  - type: calculation
    name: estimating hourly travel time per km on links (minutes/km)
    formula: "travel_time_per_km_hourly_link = (travel_time_link_avg_hourly/60)/ (seg_length/1000)"

  - type: aggregation
    group_by: [ "region",  "hour", "corridor_type" , "corridor_id", "corridor_direction" ]
    name: estimating aggregate hourly average travel time on corridors
    aggregations:
      travel_time_corridor_avg_hourly:
        column: travel_time_link_avg_hourly
        function: "sum"
    transform: true

  - type: calculation
    name: converting the travel_time_corridor_avg_hourly to minutes
    formula: "travel_time_corridor_avg_hourly = travel_time_corridor_avg_hourly/60"

  - type: calculation
    name: converting the travel_time_link_avg_hourly to minutes
    formula: "travel_time_link_avg_hourly = travel_time_link_avg_hourly/60"

  - type: calculation
    name: converting the travel_time_link_Median_hourly to minutes
    formula: "travel_time_link_Median_hourly = travel_time_link_Median_hourly/60"

  - type: calculation
    name: converting the StdDevNo0_estimated_travel_time to minutes
    formula: "StdDevNo0_estimated_travel_time = StdDevNo0_estimated_travel_time/60"

Stop Measures - Corridor Explorer

# This YAML configuration file is used to define operations for calculating aggregate stop-based measures from
# the link-based measures dataframe that is already appended to the corridor definitions.


operations:

  - type: custom_transform
    name: filtering to stops
    formula: "df = df.loc[df['is_stop'] == 1]"

  - type: calculation
    name: extracting hour
    formula: "hour = scheduled_start.dt.hour"

  - type: custom_transform
    name: creating service column
    formula: "df.loc[:, 'service'] = df['trip_id'] + '-' + df['scheduled_start'].astype(str)"


  - type: aggregation
    name: estimating aggregate measures
    group_by: [ "region", "date", "hour", "corridor_id", "corridor_desc", "corridor_direction", "corridor_type", "segment_order", "is_stop","stop_id" ]
    aggregations:
      seg_length:
        column: 'seg_length'
        function: 'first'
      count_services_hourly_per_date:
        column: "service"
        function: "non_null_count_distinct"
      estimated_boarding_stop:
        column: "boarding"
        function: "sum"
      estimated_alighting_stop:
        column: "alighting"
        function: "sum"
      estimated_load:
        column: "stop_load"
        function: "sum"
      count:
        column: 'stop_load'
        function: "size"

  - type: aggregation
    name: estimating corridor length
    group_by: [ "region", "hour", "corridor_type", "corridor_id", "corridor_direction" ]
    aggregations:
      corridor_length:
        column: seg_length
        function: "sum"
    transform: true

  - type: aggregation
    name: estimating corridor length threshold
    group_by: [ "region", "hour", "corridor_type", "corridor_id",  "corridor_direction" ]
    aggregations:
      corridor_length_threshold:
        column: corridor_length
        function: "mode"
    transform: true

  - type: custom_transform
    name: assigning accept_hour
    formula: "assign_accept_hour(df)"

Link Measures - BCAP

# This YAML configuration file is used to define operations for calculating aggregate measures for BCAP from
# the link-based measures dataframe that is already appended to the corridor definitions.

operations:
  - type: calculation
    name: extracting hour
    formula: "start_hour = scheduled_start.dt.hour"

  - type: custom_transform
    name: data cleaning  - dropping null estimated travel time
    formula: "df = df[df['link_actual_travel_time'].notnull()]"

  - type: custom_transform
    name: data cleaning  - dropping any possible case of zero travel time
    formula: "df = df[df['link_actual_travel_time']>0]"

  - type: custom_transform
    name: data cleaning  - dropping any possible duplications
    formula: "df = df.drop_duplicates()"

  - type: calculation
    name: estimating segment speed (km/hr)
    formula: "actual_speed = seg_length* 3.6/ link_actual_travel_time"

  - type: custom_transform
    name: remove outlier speed
    formula: "df = df[(df['actual_speed']<100) & (df['actual_speed']>=0)]"

  - type: custom_transform
    name: assign time period
    formula: "assign_time_period(df)"

  - type: aggregation
    group_by: [ "seg_id", "seg_direction" ]
    aggregations:
      q1:
        column: "link_actual_travel_time"
        function: "quantile"
        params:
          q: 0.25
    transform: true

  - type: aggregation
    group_by: [ "seg_id", "seg_direction" ]
    aggregations:
      q3:
        column: "link_actual_travel_time"
        function: "quantile"
        params:
          q: 0.75
    transform: true

  - type: calculation
    name: calculate_upper_outlier_bound
    formula: "upper_outlier = 1.5 * (q3 - q1) + q3"

  - type: calculation
    name: calculate_lower_outlier_bound
    formula: "lower_outlier = q1 - 1.5 * (q3 - q1)"


  - type: custom_transform
    name: assign_outlier_flag
    formula: "assign_outlier_flag(df)"

  - type: custom_transform
    name: remove outlier travel time
    formula: "df = df[df['outlier_flag']==0]"

  - type: calculation
    name: estimating actual travel time in minutes per km
    formula: "actual_travel_time_per_km = link_actual_travel_time/ (seg_length*0.06)"

  - type: aggregation
    group_by: [ 'seg_id','seg_direction' ]
    name: estimating 95 percentile of actual speed
    aggregations:
      actual_speed_q95_all:
        column: actual_speed
        function: "quantile"
        params:
          q: 0.95
    transform: true

  - type: aggregation
    group_by: [ 'seg_id','seg_direction' ]
    name: concatenating list of route-short-names
    aggregations:
      route_short_name:
        column: route_short_name
        function: "list_of_unique_strings"
    transform: true

  - type: aggregation
    name: getting unique date
    group_by: [ 'region' ]
    aggregations:
      date_count:
        column: 'date'
        function: 'non_null_count_distinct'
    transform: true

  - type: aggregation
    name: getting count of segments with speed below 20 km/hr
    group_by: [ 'seg_id','seg_direction' ]
    aggregations:
      actual_speed_below20:
        column: "actual_speed"
        function: "count_below_threshold"
        params:
          threshold: 20
    transform: true


  - type: aggregation
    name: getting count of segments with speed above 0 km/hr
    group_by: [ 'seg_id','seg_direction' ]
    aggregations:
      actual_speed_above0:
        column: "actual_speed"
        function: "count_above_threshold"
        params:
          threshold: 0
    transform: true

  - type: calculation
    name: calculate_Total_speed_below_20
    formula: "Total_speed_below_20 = 100*actual_speed_below20/actual_speed_above0"

  - type: aggregation
    name: estimating aggregate measures
    group_by: [ 'region', 'seg_id','seg_direction','time_period' ]
    aggregations:
      seg_length:
        column: "seg_length"
        function: "first"
      date_count:
        column: "date_count"
        function: "first"
      Total_speed_below_20:
        column: "Total_speed_below_20"
        function: "first"
      actual_travel_time_min:
        column: "link_actual_travel_time"
        function: "min"
      actual_travel_time_max:
        column: "link_actual_travel_time"
        function: "max"
      actual_travel_time_mean:
        column: "link_actual_travel_time"
        function: "mean"
      actual_travel_time_count:
        column: "link_actual_travel_time"
        function: "count"
      actual_travel_time_median:
        column: "link_actual_travel_time"
        function: "median"
      actual_travel_time_std:
        column: "link_actual_travel_time"
        function: "non_zero_std"
      actual_travel_time_q5:
        column: "link_actual_travel_time"
        function: "quantile"
        params:
          q: 0.5
      actual_travel_time_q95:
        column: "link_actual_travel_time"
        function: "quantile"
        params:
          q: 0.95
      actual_travel_time_per_km_mean:
        column: "actual_travel_time_per_km"
        function: "mean"
      actual_travel_time_per_km_std:
        column: "actual_travel_time_per_km"
        function: "std"
      actual_speed_min:
        column: "actual_speed"
        function: "min"
      actual_speed_max:
        column: "actual_speed"
        function: "max"
      actual_speed_mean:
        column: "actual_speed"
        function: "mean"
      actual_speed_median:
        column: "actual_speed"
        function: "median"
      actual_speed_std:
        column: "actual_speed"
        function: "non_zero_std"
      actual_speed_q5:
        column: "actual_speed"
        function: "quantile"
        params:
          q: 0.5
      actual_speed_q95:
        column: "actual_speed"
        function: "quantile"
        params:
          q: 0.95
      actual_speed_below15:
        column: "actual_speed"
        function: "count_below_threshold"
        params:
          threshold: 15
      actual_speed_below20:
        column: "actual_speed"
        function: "count_below_threshold"
        params:
          threshold: 20
      departing_passengers_sum:
        column: "stop_load"
        function: "sum"
      route_short_name:
        column: "route_short_name"
        function: 'first'
      actual_speed_q95_all:
        column: "actual_speed_q95_all"
        function: 'first'

  - type: calculation
    name: SpeedBelow20
    formula: "SpeedBelow20 = 100* actual_speed_below20/ actual_travel_time_count"

  - type: calculation
    name: JTVariability # the unit of travel time per km does not matter here; it can be seconds or hours
    formula: "JTVariability = actual_travel_time_per_km_std/ actual_travel_time_per_km_mean"


  - type: calculation
    name: CongestionIndex
    formula: "CongestionIndex = actual_speed_q95_all/ actual_speed_mean"

  - type: custom_transform
    name: ExcessPassDelay
    formula: "df['ExcessPassDelay'] = (((df['actual_travel_time_q95'] - df['actual_travel_time_mean'])/60)* (df['departing_passengers_sum'] / df['date_count']))/(df['seg_length']/1000)"

  - type: custom_transform
    name: set negative ExcessPassDelay to zero
    formula: "df.loc[df['ExcessPassDelay']<0,['ExcessPassDelay'] ] =0"

  - type: calculation
    name: AvePassDelay # this is effectively the average vehicle delay in min per km
    formula: "AvePassDelay = ((actual_travel_time_mean - actual_travel_time_q5)/60)/ (seg_length/1000)"

  - type: custom_transform
    name: set negative AvePassDelay to zero
    formula: "df.loc[df['AvePassDelay']<0,['AvePassDelay'] ] =0"

  - type: custom_transform
    name: TotalPassDelay
    formula: "df['TotalPassDelay'] = df['AvePassDelay']* (df['departing_passengers_sum'] / df['date_count'])"
  #    formula: "df['TotalPassDelay'] = (((df['actual_travel_time_mean'] - df['actual_travel_time_q5'])/60)* (df['departing_passengers_sum'] / df['date_count']))/(df['seg_length']/1000)"

  - type: custom_transform
    name: set negative TotalPassDelay to zero
    formula: "df.loc[df['TotalPassDelay']<0,['TotalPassDelay'] ] =0"

  - type: custom_transform
    name: data cleaning  - dropping empty records
    formula: "df = df[df['actual_travel_time_count']>0]"

  - type: custom_transform
    name: reformatting the outputs
    formula: "df = (df.set_index(['seg_id','seg_direction','seg_length','route_short_name','Total_speed_below_20','time_period']).unstack().reset_index())"
  
  - type: custom_transform
    name: flattening multi-index column names
    formula: "df.columns = ['_'.join(col).rstrip('-') for col in df.columns ]"

  - type: custom_transform
    name: stripping trailing underscores from column names
    formula: "df.columns = [col.rstrip('_') for col in df.columns]"

  - type: custom_transform
    name: getting the date count
    formula: "df['date_count'] = df[['date_count_am','date_count_op','date_count_pm']].max(axis=1, skipna=True)"

  - type: custom_transform
    name: fill_na_required fields
    formula: "df[[f'{c}_{tp}' for c in ['actual_travel_time_count','departing_passengers_sum' ,'ExcessPassDelay', 'TotalPassDelay', 'actual_speed_below20'] for tp in ['am', 'op', 'pm']]] = df[[f'{c}_{tp}' for c in ['actual_travel_time_count', 'departing_passengers_sum' ,'ExcessPassDelay', 'TotalPassDelay', 'actual_speed_below20'] for tp in ['am', 'op', 'pm']]].fillna(0)"

  - type: custom_transform
    name: convert data types of departing_passengers_sum to int
    formula: "df[[f'departing_passengers_sum_{tp}' for tp in ['am', 'op', 'pm']]] = df[[f'departing_passengers_sum_{tp}' for tp in ['am', 'op', 'pm']]].astype('int')"

  - type: calculation
    name: Bus_count_total_daily
    formula: "Bus_count_total_daily = (actual_travel_time_count_am + actual_travel_time_count_op + actual_travel_time_count_pm) / date_count"

  - type: calculation
    name: Passengers_total_daily
    formula: "Passengers_total_daily = (departing_passengers_sum_am + departing_passengers_sum_op + departing_passengers_sum_pm) / date_count"

  - type: calculation
    name: Excess_passenger_delay_daily
    formula: "Excess_passenger_delay_daily = (ExcessPassDelay_am + ExcessPassDelay_op + ExcessPassDelay_pm)"

  - type: calculation
    name: Total_passenger_delay_daily
    formula: "Total_passenger_delay_daily = (TotalPassDelay_am + TotalPassDelay_op + TotalPassDelay_pm)"

  - type: custom_transform
    name: create unique seg identifier
    formula: "df['seg_unique'] = df['seg_id'] +'-' + df['seg_direction'] "

  - type: custom_transform
    name: assign JTV_flag
    formula: "assign_flag(df, 'JTVariability')"

  - type: custom_transform
    name: assign CI_flag
    formula: "assign_flag(df, 'CongestionIndex')"

  - type: custom_transform
    name: assign SpeedBelow20 flag
    formula: "assign_flag(df, 'SpeedBelow20')"

  - type: custom_transform
    name: assign ExcessPassDelay flag
    formula: "assign_flag(df, 'ExcessPassDelay')"

  - type: custom_transform
    name: assign TotalPassDelay flag
    formula: "assign_flag(df, 'TotalPassDelay')"

  - type: custom_transform
    name: assign AvePassDelay flag
    formula: "assign_flag(df, 'AvePassDelay')"

Data Schemas

Raw Transactions Schema

metadata:
  author: MT
  last_modified: '2023-10-12'
  notes: The raw schema does not have boarding_stop_id and alighting_stop_id fields; instead, the stop IDs are embedded in the boarding_stop and alighting_stop fields.
schema_type: dataframe
strict: false

columns:
  actual_end:
    dtype: "object"
    nullable: true
    required: true
  actual_start:
    dtype: "object"
    coerce: true
    nullable: true
    required: true
  alighting_stop:
    dtype: "object"
    nullable: true
    required: true
    correction: "extract_stop_id(x)"
    new_column: alighting_stop_id  # creating a new column from alighting_stop using the above correction.
  alighting_time:
    dtype: "object"
    nullable: true
    required: true
    coerce: true
  boarding_stop:
    dtype: "object"
    nullable: true
    required: true
    correction: "extract_stop_id(x)"
    new_column: boarding_stop_id  # creating a new column from boarding_stop using the above correction.
  boarding_time:
    dtype: "object"
    nullable: true
    required: true
    coerce: true
  direction:
    dtype: "object"
    nullable: true
    required: true
  operator:
    dtype: "object"
    nullable: true
    required: true
  passenger_type_level_2:
    dtype: "object"
    nullable: true
    required: true
  passengers:
    dtype: "object"
    nullable: true
    coerce: true
    required: true
  ticket_status:
    dtype: "object"
    nullable: true
    coerce: true
    required: true
  period_date:
    dtype: "datetime64[ns]"
    format: '%Y-%m-%d'
    nullable: true
    required: true
    coerce: true
  route:
    dtype: "object"
    nullable: true
    required: true
  scheduled_start:
    dtype: "object"
    nullable: true
    required: true
    coerce: true
  service:
    dtype: "object"
    nullable: true
    required: true
  ticket_type_level_3:
    dtype: "object"
    required: true
  vehicle:
    dtype: "object"
    required: true
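
The correction and new_column fields drive the stop-ID derivation during ingestion: the correction expression is evaluated per value of the source column and, where new_column is present, the result is written to that column. extract_stop_id is a pipeline helper whose exact behaviour is not shown in this appendix, so the sketch below substitutes a simple regex and is only meant to illustrate the mechanism.

import re

import pandas as pd

def extract_stop_id(x):
    """Illustrative stand-in for the pipeline's extract_stop_id helper:
    pull the numeric stop id out of a combined stop description."""
    match = re.search(r'\d+', str(x))
    return match.group() if match else None

def apply_corrections(df: pd.DataFrame, columns: dict) -> pd.DataFrame:
    """Apply each column's 'correction' expression, writing to 'new_column' when given."""
    for col, spec in columns.items():
        if 'correction' in spec and col in df.columns:
            expr = spec['correction']              # e.g. "extract_stop_id(x)"
            target = spec.get('new_column', col)
            df[target] = df[col].map(lambda x: eval(expr))
        # the other keys (dtype, nullable, required, coerce) are handled by validation
    return df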

Transaction Schema

# Transaction Data Schema
#
# This schema is used to validate and preprocess the transaction data before further analysis.
# Below are the specifications for each column in the data, including their data types,
# whether they are required, and any corrections that should be applied upon reading.
#
# The schema uses strict=false, which allows Pandera to validate only
# the columns defined in the schema and ignore any extra columns in the data.
#


metadata:
  author: MT
  last_modified: '2023-10-12'
  notes: Initial schema with corrections for boarding_stop_id and alighting_stop_id.
schema_type: dataframe
strict: false

columns:
  actual_end:
    dtype: "datetime64[ns]"
    format: '%Y-%m-%d %H:%M:%S'
    nullable: true
    required: true
    coerce: true
  actual_start:
    definition: this is the tap-on time for the transaction
    dtype: "datetime64[ns]"
    format: '%Y-%m-%d %H:%M:%S'
    note: actual start is needed to calculate the travel time
    coerce: true
    nullable: true
    required: true
  alighting_stop:
    dtype: "object"
    nullable: true
    required: true
  alighting_stop_id:
    correction: "str(x).rstrip('.0')"
    dtype: "object"
    required: true
  alighting_time:
    dtype: "datetime64[ns]"
    format: '%Y-%m-%d %H:%M:%S'
    nullable: true
    required: true
    coerce: true
  boarding_stop:
    dtype: "object"
    nullable: true
    required: true
  boarding_stop_id:
    correction: "str(x).rstrip('.0')"
    dtype: "object"
    required: true
  boarding_time:
    dtype: "datetime64[ns]"
    format: '%Y-%m-%d %H:%M:%S'
    nullable: true
    required: true
    coerce: true
  direction:
    dtype: "object"
    nullable: true
    required: true
  operator:
    dtype: "object"
    nullable: true
    required: true
  passenger_type_level_2:
    dtype: "object"
    nullable: true
    required: true
  passengers:
    dtype: 'Int64'
    correction: "0 if pd.isna(x) else float(x)"
    nullable: true
    coerce: true
    required: true
  period_date:
    dtype: "datetime64[ns]"
    format: '%Y-%m-%d'
    nullable: true
    required: true
    coerce: true
  route:
    dtype: "object"
    nullable: true
    required: true
  scheduled_start:
    dtype: "datetime64[ns]"
    format: '%Y-%m-%d %H:%M:%S'
    nullable: true
    required: true
    coerce: true
  service:
    dtype: "object"
    nullable: true
    required: true
  ticket_status:
    dtype: "object"
    nullable: true
    required: true
  ticket_type_level_3:
    nullable: true
    dtype: "object"
    required: true
  vehicle:
    nullable: true
    dtype: "object"
    required: true
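
The transaction schemas are Pandera-style dataframe schemas (schema_type: dataframe, strict: false, so extra columns pass through untouched). Below is a minimal sketch of turning such a YAML file into a pandera.DataFrameSchema; it assumes the column keys map directly onto pandera.Column arguments and ignores the pipeline-specific keys (format, correction, new_column, definition, note), which the project's own loader presumably handles.

import pandera as pa
import yaml

def build_schema(schema_path: str) -> pa.DataFrameSchema:
    """Build a Pandera DataFrameSchema from a YAML schema file of the shape shown above."""
    with open(schema_path) as f:
        spec = yaml.safe_load(f)
    columns = {
        name: pa.Column(
            dtype=col.get('dtype', 'object'),
            nullable=col.get('nullable', False),
            required=col.get('required', True),
            coerce=col.get('coerce', False),
        )
        for name, col in spec['columns'].items()
    }
    return pa.DataFrameSchema(columns, strict=spec.get('strict', False))

# Usage (file name follows the transactions_schema_{schema_version}.yaml pattern):
# schema = build_schema('transactions_schema_2023.0.0.yaml')
# validated_df = schema.validate(transactions_df)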




Raw Itinerary


# HASTUS Itinerary Schema
#
# In the 2024 extract, the itinerary files provided have column headers (previous versions had no column names). These headers should be renamed and adjusted as specified below.

columns:
  tpat_route_id:
    nullable: false
    required: true
    rename_to: 'shape_id'
    note: To estimate link-based travel time, all shape IDs in the analysis period must be found in the itinerary table of the input GTFS. A missing shape ID is acceptable only if it corresponds to a road that is unimportant for the analysis.
  itin_tpat_counter:
    dtype: int64
    coerce: true
    required: true
    nullable: false
    rename_to: 'segment'
    note: A segment is a distinct stop-to-stop portion of a trip.
  itin_seg_counter:
    dtype: int64
    coerce: true
    required: true
    nullable: false
    rename_to: 'order'
    note: Order of the links within each segment.
  itin_seg_direction:
    required: true
    nullable: false
    coerce: true
    dtype: category
    enums: [ 'To Destination', 'To Origin', 'To ' ]
    rename_to: 'seg_direction'
    note: This is the direction in which the spatial line is drawn (as per the specification of the StreetSegmentNetwork) and has nothing to do with the direction of travel or the direction of the trip.
  itin_seg_external_id:
    required: true
    nullable: true
    rename_to: 'seg_id'
    note: This is the seg_id in the StreetSegmentNetwork. The ID is not a unique link identifier unless it is combined with the direction field. Some records in the itinerary file have no seg_id. A given combination of seg_id and seg_direction can appear multiple times in this dataset, as different shape IDs use the same link.
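
Because only the newer extracts carry headers, the loader presumably reads older files using the column_names list given in the main configuration and renames the 2024-style headers via rename_to. The sketch below illustrates that branch; the helper name and dispatch are assumptions.

import pandas as pd
import yaml

def read_itinerary(csv_path: str, schema_path: str, has_header: bool) -> pd.DataFrame:
    """Read a HASTUS itinerary file, normalising 2024-style headers to the standard names."""
    with open(schema_path) as f:
        spec = yaml.safe_load(f)
    renames = {src: col['rename_to'] for src, col in spec['columns'].items() if 'rename_to' in col}
    if has_header:
        # 2024 extract: headers present, rename them (e.g. tpat_route_id -> shape_id)
        return pd.read_csv(csv_path).rename(columns=renames)
    # earlier extracts: no header row, so assign the standard names directly
    # (in practice the column_names list from the main configuration would be used here)
    return pd.read_csv(csv_path, header=None, names=list(renames.values()))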