ssscoring.flysight
Functions and logic for detecting, validating and manipulating FlySight CSV files, including detection in the file system. The functions in this module assume that a data lake exists somewhere in the file system (whether local or cloud-based).
# See: https://github.com/pr3d4t0r/SSScoring/blob/master/LICENSE.txt

"""
Functions and logic for detecting, validating and manipulating
FlySight CSV files, including detection in the file system.  The functions in
this module assume that a data lake exists somewhere in the file system (whether
local or cloud-based).
"""


from collections import OrderedDict
from io import StringIO
from pathlib import Path

from ssscoring.constants import FLYSIGHT_1_HEADER
from ssscoring.constants import FLYSIGHT_2_HEADER
from ssscoring.constants import FLYSIGHT_FILE_ENCODING
from ssscoring.constants import IGNORE_LIST
from ssscoring.constants import MIN_JUMP_FILE_SIZE
from ssscoring.datatypes import FlySightVersion
from ssscoring.errors import SSScoringError

import csv
import os
import shutil
import tempfile

import pandas as pd


# +++ functions +++

def isCRMangledCSV(fileThing) -> bool:
    """
    Tests if `fileThing` is an Excel or Dropbox DOS file with lines terminated
    in CRCRLF.  These occur when someone opens the file with Excel or some other
    tool in a Windows system and saves the file back to the file system,
    mangling the original format.

    Arguments
    ---------
        fileThing
    A string or `pathlib.Path` object associated with what looks like a FlySight
    CR mangled file.

    Returns
    -------
    `True` if the CRCRLF byte sequence appears anywhere in the file.  The whole
    file is read and scanned, not only a leading window of it.
    """
    with open(fileThing, 'rb') as inputFile:
        return b'\r\r\n' in inputFile.read()


def fixCRMangledCSV(fileThing):
    r"""
    Open the file associated with `fileThing` and replace all `\r\r\n` with
    `\r\n` EOL markers, rewriting the file in place.

    Arguments
    ---------
        fileThing
    A string or `pathlib.Path` object associated with what looks like a FlySight
    CR mangled file.

    See
    ---
    `ssscoring.flysight.isCRMangledCSV`
    """
    with open(fileThing, 'rb') as inputFile:
        fileContents = inputFile.read()
    fixedContents = fileContents.replace(b'\r\r\n', b'\r\n')
    # The corrected data is staged in a temporary file first so the original
    # is only overwritten after the corrected copy was written out in full.
    outputFile = tempfile.NamedTemporaryFile(delete = False)
    tempFileName = outputFile.name
    try:
        with outputFile:
            outputFile.write(fixedContents)
        # copyfile() transfers the data only; copy() would also stamp the
        # temporary file's owner-only permission bits onto the user's file.
        shutil.copyfile(tempFileName, fileThing)
    finally:
        # Never leave the staging file behind, even if the copy failed.
        os.unlink(tempFileName)


def skipOverFS2MetadataRowsIn(data: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a clean dataframe on which any metadata rows within the first 100
    are skipped.  This function uses the `time` column to detect valid rows.  A
    `time == NaN` is considered invalid and skipped.

    Arguments
    ---------
        data
    A FlySight 2 dataframe suspected of having dirty N first rows with metadata

    Returns
    -------
    A FlySight 2 clean dataframe without any leading metadata rows.  If none of
    the inspected rows (at most the first 100) has a valid `time`, all the
    inspected rows are dropped; an empty dataframe is returned unchanged.
    """
    # Slicing clamps to the dataframe length, so short or empty frames are safe.
    isValidRow = data['time'].iloc[:100].notna().to_numpy()
    firstValid = int(isValidRow.argmax()) if isValidRow.any() else len(isValidRow)
    return data.iloc[firstValid:]


def validFlySightHeaderIn(fileThingCSV) -> bool:
    """
    Checks if a file is a CSV in FlySight 1 or FlySight 2 formats.  The checks
    include:

    - Whether the file is a CSV, using a comma delimiter
    - Checks for the presence of all the documented FlySight 1 headers
    - Checks for the presence of the FlySight 2 line 1 identifier

    Arguments
    ---------
        fileThingCSV
    A file thing to verify as a valid FlySight file; can be a string, an
    instance of `pathlib.Path`, or a buffer of `bytes`.

    Returns
    -------
    `True` if `fileThingCSV` is a FlySight CSV file, otherwise `False`.
    """
    delimiters = [',', ]
    if isinstance(fileThingCSV, bytes):
        stream = StringIO(fileThingCSV.decode(FLYSIGHT_FILE_ENCODING))
    else:
        stream = open(fileThingCSV, 'r')

    # Both stream types are context managers; this closes the file handle on
    # every exit path.
    with stream:
        try:
            dialect = csv.Sniffer().sniff(stream.readline(), delimiters = delimiters)
        except (csv.Error, UnicodeDecodeError):
            # Not comma-delimited, empty, or not text at all.
            return False
        if dialect.delimiter not in delimiters:
            return False
        stream.seek(0)
        header = next(csv.reader(stream))
    return header[0] == '$FLYS' or FLYSIGHT_1_HEADER.issubset(header)


def getAllSpeedJumpFilesFrom(dataLake: Path) -> dict:
    """
    Get a list of all the speed jump files from a data lake, where data lake is
    defined as a reachable path that contains one or more FlySight CSV files.
    This function tests each file to ensure that it's a speed skydive FlySight
    file in a valid format and length.  It doesn't validate data like versions
    prior to 1.9.0.

    Arguments
    ---------
        dataLake
    A valid (absolute or relative) path, as a string or `pathlib.Path`, to the
    top level directory where the data lake starts.

    Returns
    -------
    A dictionary of speed jump file names for later SSScoring processing,
    sorted by file name:
        - keys are the file names, as `pathlib.Path` objects
        - values are a FlySight version string tag, `'1'` or `'2'`
    """
    jumpFiles = OrderedDict()
    for root, _dirs, files in os.walk(dataLake):
        if any(name in root for name in IGNORE_LIST):
            continue
        for fileName in files:
            if '.swp' in fileName: # Ignore Vim, other editors swap file
                continue
            if '.CSV' not in fileName.upper():
                continue
            isFS2Track = 'TRACK' in fileName
            if not isFS2Track and any(x in fileName for x in ('EVENT', 'SENSOR')):
                # FlySight 2 event and sensor logs are not track files
                continue
            jumpFileName = Path(root) / fileName
            # Cheap checks first:  tiny, empty or foreign CSV files are skipped
            # before pandas is ever asked to parse them.
            if os.stat(jumpFileName).st_size < MIN_JUMP_FILE_SIZE:
                continue
            if not validFlySightHeaderIn(jumpFileName):
                continue
            # The parsed data is discarded; parsing only confirms that the
            # track can be loaded.
            if isFS2Track:
                # FlySight 2 track custom format
                data = pd.read_csv(jumpFileName, names = FLYSIGHT_2_HEADER, skiprows = 6, index_col = False, na_values = ['NA', ], dtype={ 'hMSL': float, })
                data = skipOverFS2MetadataRowsIn(data)
                data.drop('GNSS', inplace = True, axis = 1)
                version = '2'
            else:
                # FlySight 1 track format
                pd.read_csv(jumpFileName, skiprows = (1, 1), index_col = False)
                version = '1'
            jumpFiles[jumpFileName] = version
    return OrderedDict(sorted(jumpFiles.items()))


def detectFlySightFileVersionOf(fileThing) -> FlySightVersion:
    """
    Detects the FlySight file version based on its file name and format.

    Arguments
    ---------
        fileThing
    A string, `bytes` buffer or `pathlib.Path` object corresponding to track
    file.  If string or `pathlib.Path`, it'll be treated as a file.

    Returns
    -------
    An instance of `ssscoring.datatypes.FlySightVersion` with a valid version
    symbolic value.

    Errors
    ------
    `ssscoring.errors.SSScoringError` if `fileThing` is of an unsupported type,
    the file is missing, is not a CSV, or is in some other invalid format.
    """
    if isinstance(fileThing, Path):
        fileName = fileThing.as_posix()
    elif isinstance(fileThing, str):
        fileName = fileThing
        fileThing = Path(fileThing)
    elif isinstance(fileThing, bytes):
        # Buffers have no name; use a placeholder that passes the name checks.
        fileName = '00-00-00.CSV'
    else:
        raise SSScoringError('Unsupported track source type: %s' % type(fileThing).__name__)

    delimiters = [',', ]
    if '.CSV' not in fileName.upper():
        raise SSScoringError('Invalid file extension type')
    if any(x in fileName for x in ('EVENT.CSV', 'SENSOR.CSV')):
        raise SSScoringError('Only TRACK.CSV v2 files can be processed at this time')
    if isinstance(fileThing, Path):
        if not fileThing.is_file():
            raise SSScoringError('%s - file not found in data lake' % fileName)
        if not validFlySightHeaderIn(fileName):
            raise SSScoringError('CSV is not a valid FlySight file')
        stream = open(fileName, 'r')
    else:
        stream = StringIO(fileThing.decode(FLYSIGHT_FILE_ENCODING))

    # Both stream types are context managers; this closes the file handle on
    # every exit path, including the error ones.
    with stream:
        try:
            dialect = csv.Sniffer().sniff(stream.readline(), delimiters = delimiters)
        except (csv.Error, UnicodeDecodeError) as error:
            raise SSScoringError('Error while trying to validate %s file format' % fileName) from error
        if dialect.delimiter not in delimiters:
            raise SSScoringError('CSV uses a different delimiter from FlySight')
        stream.seek(0)
        header = next(csv.reader(stream))
    if header[0] == '$FLYS':
        return FlySightVersion.V2
    if FLYSIGHT_1_HEADER.issubset(header):
        return FlySightVersion.V1
    raise SSScoringError('%s file is not a FlySight v1 or v2 file' % fileName)
def isCRMangledCSV(fileThing) -> bool:
    """
    Tests if `fileThing` is an Excel or Dropbox DOS file with lines terminated
    in CRCRLF.  These occur when someone opens the file with Excel or some other
    tool in a Windows system and saves the file back to the file system,
    mangling the original format.

    Arguments
    ---------
        fileThing
    A string or `pathlib.Path` object associated with what looks like a FlySight
    CR mangled file.

    Returns
    -------
    `True` if the CRCRLF byte sequence appears anywhere in the file.  The whole
    file is read and scanned, not only a leading window of it.
    """
    with open(fileThing, 'rb') as inputFile:
        return b'\r\r\n' in inputFile.read()
Tests if fileThing
is an Excel or Dropbox DOS file with lines terminated
in CRCRLF. These occur when someone opens the file with Excel or some other
tool in a Windows system and saves the file back to the file system,
mangling the original format.
Arguments
fileThing
A string or pathlib.Path
object associated with what looks like a FlySight
CR mangled file.
Returns
True
if the file has one or more lines ending in CRCRLF anywhere in its
contents; the whole file is scanned.
def fixCRMangledCSV(fileThing):
    r"""
    Open the file associated with `fileThing` and replace all `\r\r\n` with
    `\r\n` EOL markers, rewriting the file in place.

    Arguments
    ---------
        fileThing
    A string or `pathlib.Path` object associated with what looks like a FlySight
    CR mangled file.

    See
    ---
    `ssscoring.flysight.isCRMangledCSV`
    """
    with open(fileThing, 'rb') as inputFile:
        fileContents = inputFile.read()
    fixedContents = fileContents.replace(b'\r\r\n', b'\r\n')
    # The corrected data is staged in a temporary file first so the original
    # is only overwritten after the corrected copy was written out in full.
    outputFile = tempfile.NamedTemporaryFile(delete = False)
    tempFileName = outputFile.name
    try:
        with outputFile:
            outputFile.write(fixedContents)
        # copyfile() transfers the data only; copy() would also stamp the
        # temporary file's owner-only permission bits onto the user's file.
        shutil.copyfile(tempFileName, fileThing)
    finally:
        # Never leave the staging file behind, even if the copy failed.
        os.unlink(tempFileName)
Open the file associated with fileThing
and replace all `\r\r\n` with
`\r\n` EOL markers.
Arguments
---------
fileThing
A string or `pathlib.Path` object associated with what looks like a FlySight
CR mangled file.
See
---
`ssscoring.flysight.isCRMangledCSV`
def skipOverFS2MetadataRowsIn(data: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a clean dataframe on which any metadata rows within the first 100
    are skipped.  This function uses the `time` column to detect valid rows.  A
    `time == NaN` is considered invalid and skipped.

    Arguments
    ---------
        data
    A FlySight 2 dataframe suspected of having dirty N first rows with metadata

    Returns
    -------
    A FlySight 2 clean dataframe without any leading metadata rows.  If none of
    the inspected rows (at most the first 100) has a valid `time`, all the
    inspected rows are dropped; an empty dataframe is returned unchanged.
    """
    # Slicing clamps to the dataframe length, so short or empty frames are safe.
    isValidRow = data['time'].iloc[:100].notna().to_numpy()
    firstValid = int(isValidRow.argmax()) if isValidRow.any() else len(isValidRow)
    return data.iloc[firstValid:]
Returns a clean dataframe on which any metadata rows within the first 100
are skipped. This function uses the time
column to detect valid rows. A
time == NaN
is considered invalid and skipped.
Arguments
data
A FlySight 2 dataframe suspected of having dirty N first rows with metadata
Returns
A FlySight 2 clean dataframe without any leading metadata rows.
def validFlySightHeaderIn(fileThingCSV) -> bool:
    """
    Checks if a file is a CSV in FlySight 1 or FlySight 2 formats.  The checks
    include:

    - Whether the file is a CSV, using a comma delimiter
    - Checks for the presence of all the documented FlySight 1 headers
    - Checks for the presence of the FlySight 2 line 1 identifier

    Arguments
    ---------
        fileThingCSV
    A file thing to verify as a valid FlySight file; can be a string, an
    instance of `pathlib.Path`, or a buffer of `bytes`.

    Returns
    -------
    `True` if `fileThingCSV` is a FlySight CSV file, otherwise `False`.
    """
    delimiters = [',', ]
    if isinstance(fileThingCSV, bytes):
        stream = StringIO(fileThingCSV.decode(FLYSIGHT_FILE_ENCODING))
    else:
        stream = open(fileThingCSV, 'r')

    # Both stream types are context managers; this closes the file handle on
    # every exit path.
    with stream:
        try:
            dialect = csv.Sniffer().sniff(stream.readline(), delimiters = delimiters)
        except (csv.Error, UnicodeDecodeError):
            # Not comma-delimited, empty, or not text at all.
            return False
        if dialect.delimiter not in delimiters:
            return False
        stream.seek(0)
        header = next(csv.reader(stream))
    return header[0] == '$FLYS' or FLYSIGHT_1_HEADER.issubset(header)
Checks if a file is a CSV in FlySight 1 or FlySight 2 formats. The checks include:
- Whether the file is a CSV, using a comma delimiter
- Checks for the presence of all the documented FlySight 1 headers
- Checks for the presence of the FlySight 2 line 1 identifier
Arguments
fileThingCSV
A file thing to verify as a valid FlySight file; can be a string, an
instance of pathlib.Path
, or a buffer of bytes
.
Returns
True
if fileThingCSV
is a FlySight CSV file, otherwise False
.
def getAllSpeedJumpFilesFrom(dataLake: Path) -> dict:
    """
    Get a list of all the speed jump files from a data lake, where data lake is
    defined as a reachable path that contains one or more FlySight CSV files.
    This function tests each file to ensure that it's a speed skydive FlySight
    file in a valid format and length.  It doesn't validate data like versions
    prior to 1.9.0.

    Arguments
    ---------
        dataLake
    A valid (absolute or relative) path, as a string or `pathlib.Path`, to the
    top level directory where the data lake starts.

    Returns
    -------
    A dictionary of speed jump file names for later SSScoring processing,
    sorted by file name:
        - keys are the file names, as `pathlib.Path` objects
        - values are a FlySight version string tag, `'1'` or `'2'`
    """
    jumpFiles = OrderedDict()
    for root, _dirs, files in os.walk(dataLake):
        if any(name in root for name in IGNORE_LIST):
            continue
        for fileName in files:
            if '.swp' in fileName: # Ignore Vim, other editors swap file
                continue
            if '.CSV' not in fileName.upper():
                continue
            isFS2Track = 'TRACK' in fileName
            if not isFS2Track and any(x in fileName for x in ('EVENT', 'SENSOR')):
                # FlySight 2 event and sensor logs are not track files
                continue
            jumpFileName = Path(root) / fileName
            # Cheap checks first:  tiny, empty or foreign CSV files are skipped
            # before pandas is ever asked to parse them.
            if os.stat(jumpFileName).st_size < MIN_JUMP_FILE_SIZE:
                continue
            if not validFlySightHeaderIn(jumpFileName):
                continue
            # The parsed data is discarded; parsing only confirms that the
            # track can be loaded.
            if isFS2Track:
                # FlySight 2 track custom format
                data = pd.read_csv(jumpFileName, names = FLYSIGHT_2_HEADER, skiprows = 6, index_col = False, na_values = ['NA', ], dtype={ 'hMSL': float, })
                data = skipOverFS2MetadataRowsIn(data)
                data.drop('GNSS', inplace = True, axis = 1)
                version = '2'
            else:
                # FlySight 1 track format
                pd.read_csv(jumpFileName, skiprows = (1, 1), index_col = False)
                version = '1'
            jumpFiles[jumpFileName] = version
    return OrderedDict(sorted(jumpFiles.items()))
Get a list of all the speed jump files from a data lake, where data lake is defined as a reachable path that contains one or more FlySight CSV files. This function tests each file to ensure that it's a speed skydive FlySight file in a valid format and length. It doesn't validate data like versions prior to 1.9.0.
Arguments
dataLake: str
A valid (absolute or relative) path name to the top level directory where the data lake starts.
Returns
A dictionary of speed jump file names for later SSScoring processing: - keys are the file names - values are a FlySight version string tag
def detectFlySightFileVersionOf(fileThing) -> FlySightVersion:
    """
    Detects the FlySight file version based on its file name and format.

    Arguments
    ---------
        fileThing
    A string, `bytes` buffer or `pathlib.Path` object corresponding to track
    file.  If string or `pathlib.Path`, it'll be treated as a file.

    Returns
    -------
    An instance of `ssscoring.datatypes.FlySightVersion` with a valid version
    symbolic value.

    Errors
    ------
    `ssscoring.errors.SSScoringError` if `fileThing` is of an unsupported type,
    the file is missing, is not a CSV, or is in some other invalid format.
    """
    if isinstance(fileThing, Path):
        fileName = fileThing.as_posix()
    elif isinstance(fileThing, str):
        fileName = fileThing
        fileThing = Path(fileThing)
    elif isinstance(fileThing, bytes):
        # Buffers have no name; use a placeholder that passes the name checks.
        fileName = '00-00-00.CSV'
    else:
        raise SSScoringError('Unsupported track source type: %s' % type(fileThing).__name__)

    delimiters = [',', ]
    if '.CSV' not in fileName.upper():
        raise SSScoringError('Invalid file extension type')
    if any(x in fileName for x in ('EVENT.CSV', 'SENSOR.CSV')):
        raise SSScoringError('Only TRACK.CSV v2 files can be processed at this time')
    if isinstance(fileThing, Path):
        if not fileThing.is_file():
            raise SSScoringError('%s - file not found in data lake' % fileName)
        if not validFlySightHeaderIn(fileName):
            raise SSScoringError('CSV is not a valid FlySight file')
        stream = open(fileName, 'r')
    else:
        stream = StringIO(fileThing.decode(FLYSIGHT_FILE_ENCODING))

    # Both stream types are context managers; this closes the file handle on
    # every exit path, including the error ones.
    with stream:
        try:
            dialect = csv.Sniffer().sniff(stream.readline(), delimiters = delimiters)
        except (csv.Error, UnicodeDecodeError) as error:
            raise SSScoringError('Error while trying to validate %s file format' % fileName) from error
        if dialect.delimiter not in delimiters:
            raise SSScoringError('CSV uses a different delimiter from FlySight')
        stream.seek(0)
        header = next(csv.reader(stream))
    if header[0] == '$FLYS':
        return FlySightVersion.V2
    if FLYSIGHT_1_HEADER.issubset(header):
        return FlySightVersion.V1
    raise SSScoringError('%s file is not a FlySight v1 or v2 file' % fileName)
Detects the FlySight file version based on its file name and format.
Arguments
fileThing
A string, bytes
buffer or pathlib.Path
object corresponding to track
file. If string or pathlib.Path
, it'll be treated as a file.
Returns
An instance of ssscoring.flysight.FlySightVersion
with a valid version
symbolic value.
Errors
ssscoring.errors.SSScoringError
if the file is not a CSV or is in some
other invalid format.