ssscoring.flysight

Functions and logic for detecting, validating and manipulating FlySight CSV files, including detection in the file system. The functions in this module assume that a data lake exists somewhere in the file system (whether local or cloud-based).

  1# See: https://github.com/pr3d4t0r/SSScoring/blob/master/LICENSE.txt
  2
  3"""
  4Functions and logic for detecting, validating and manipulating
  5FlySight CSV files, including detection in the file system.  The functions in
  6this module assume that a data lake exists somewhere in the file system (whether
  7local or cloud-based).
  8"""
  9
 10
 11from collections import OrderedDict
 12from io import StringIO
 13from pathlib import Path
 14
 15from ssscoring.constants import FLYSIGHT_1_HEADER
 16from ssscoring.constants import FLYSIGHT_2_HEADER
 17from ssscoring.constants import FLYSIGHT_FILE_ENCODING
 18from ssscoring.constants import IGNORE_LIST
 19from ssscoring.constants import MIN_JUMP_FILE_SIZE
 20from ssscoring.datatypes import FlySightVersion
 21from ssscoring.errors import SSScoringError
 22
 23import csv
 24import os
 25import shutil
 26import tempfile
 27
 28import pandas as pd
 29
 30
 31# +++ functions +++
 32
 33def isCRMangledCSV(fileThing) -> bool:
 34    """
 35    Tests if `fileThing` is an Excel or Dropbox DOS file with lines terminated
 36    in CRCRLF.  These occur when someone opens the file with Excel or some other
 37    tool in a Windows system and saves the file back to the file system,
 38    mangling the original format.
 39
 40    Arguments
 41    ---------
 42        fileThing
 43    A string or `pathlib.Path` object associated with what looks like a FlySight
 44    CR mangled file.
 45
 46    Returns
 47    -------
 48    `True` if the file has one or more lines ending in CRCRLF within the first
 49    512 bytes of data.
 50    """
 51    with open (fileThing, 'rb') as file:
 52        rawData = file.read()
 53        return b'\r\r\n' in rawData
 54
 55
 56def fixCRMangledCSV(fileThing):
 57    """
 58    Open the file associated with `fileThing` and repleace all`\r\r\b` with
 59    `\r\n` EOL markers.
 60
 61    Arguments
 62    ---------
 63        fileThing
 64    A string or `pathlib.Path` object associated with what looks like a FlySight
 65    CR mangled file.
 66
 67    See
 68    ---
 69    `ssscoring.flysight.isCRMangledCSV`
 70    """
 71    with open(fileThing, 'rb') as inputFile:
 72        fileContents = inputFile.read()
 73    fileContents = fileContents.replace(b'\r\r\n', b'\r\n')
 74    with tempfile.NamedTemporaryFile(delete = False) as outputFile:
 75        outputFile.write(fileContents)
 76        tempFileName = outputFile.name
 77    shutil.copy(tempFileName, fileThing)
 78    os.unlink(tempFileName)
 79
 80
 81def skipOverFS2MetadataRowsIn(data: pd.DataFrame) -> pd.DataFrame:
 82    """
 83    Returns a clean dataframe on which any metadata rows within the first 100
 84    are skipped.  This function uses the `time` column to detect valid rows.  A
 85    `time == NaN` is considered invalid and skipped.
 86
 87    Arguments
 88    ---------
 89        data
 90    A FlySight 2 dataframe suspected of having dirty N first rows with metadata
 91
 92    Returns
 93    -------
 94    A FlySight 2 clean dataframe without any leading metadata rows.
 95    """
 96    for ref in range(0,100):
 97        if pd.notnull(data.iloc[ref].time):
 98            break
 99    return data.iloc[ref:]
100
101
def validFlySightHeaderIn(fileThingCSV) -> bool:
    """
    Checks if a file is a CSV in FlySight 1 or FlySight 2 formats.  The checks
    include:

    - Whether the file is a CSV, using a comma delimiter
    - Checks for the presence of all the documented FlySight 1 headers
    - Checks for the presence of the FlySight 2 line 1 identifier

    Arguments
    ---------
        fileThingCSV
    A file thing to verify as a valid FlySight file; can be a string, an
    instance of `pathlib.Path`, or a buffer of `bytes`.

    Returns
    -------
    `True` if `fileThingCSV` is a FlySight CSV file, otherwise `False`.
    """
    delimiters =  [',', ]
    if isinstance(fileThingCSV, bytes):
        stream = StringIO(fileThingCSV.decode(FLYSIGHT_FILE_ENCODING))
    else:
        stream = open(fileThingCSV, 'r')

    # The context manager guarantees that the file handle is released on every
    # exit path; StringIO supports the same protocol.
    with stream:
        try:
            dialect = csv.Sniffer().sniff(stream.readline(), delimiters = delimiters)
        except Exception:
            # Best effort:  anything that prevents sniffing the first line
            # means that this isn't a usable FlySight CSV.
            return False
        if dialect.delimiter not in delimiters:
            return False
        stream.seek(0)
        header = next(csv.reader(stream))
    # FlySight 2 files start with the $FLYS tag; FlySight 1 files must carry
    # every documented column name in their first row.
    return header[0] == '$FLYS' or FLYSIGHT_1_HEADER.issubset(header)
140
141
def getAllSpeedJumpFilesFrom(dataLake: Path) -> dict:
    """
    Walks a data lake, defined as a reachable path that contains one or more
    FlySight CSV files, and collects every file that looks like a speed skydive
    FlySight track:  it parses as a track file, is at least
    `MIN_JUMP_FILE_SIZE` bytes long, and has a valid FlySight header.

    Arguments
    ---------
        dataLake: str
    A valid (absolute or relative) path name to the top level directory where
    the data lake starts.

    Returns
    -------
    A dictionary, sorted by key, of speed jump files for later SSScoring
    processing:
        - keys are the file paths
        - values are a FlySight version string tag, `'1'` or `'2'`
    """
    candidates = {}
    for root, _dirs, files in os.walk(dataLake):
        if any(name in root for name in IGNORE_LIST):
            continue
        for fileName in files:
            # Ignore Vim, other editors swap files, and anything that isn't CSV
            if '.swp' in fileName or '.CSV' not in fileName.upper():
                continue
            jumpFileName = Path(root) / fileName
            fileSize = os.stat(jumpFileName).st_size
            isFlySight2Track = 'TRACK' in fileName
            if not isFlySight2Track and ('EVENT' in fileName or 'SENSOR' in fileName):
                # FlySight 2 auxiliary files aren't tracks
                continue
            # The parsed dataframe isn't used beyond this point; the parse
            # still runs so that malformed files raise here.
            if isFlySight2Track:
                # FlySight 2 track custom format
                version = '2'
                data = pd.read_csv(jumpFileName, names = FLYSIGHT_2_HEADER, skiprows = 6, index_col = False, na_values = ['NA', ], dtype={ 'hMSL': float, })
                data = skipOverFS2MetadataRowsIn(data)
                data.drop('GNSS', inplace = True, axis = 1)
            else:
                # FlySight 1 track format
                version = '1'
                data = pd.read_csv(jumpFileName, skiprows = (1, 1), index_col = False)
            if fileSize >= MIN_JUMP_FILE_SIZE and validFlySightHeaderIn(jumpFileName):
                candidates[jumpFileName] = version
    return OrderedDict(sorted(candidates.items()))
188
189
def detectFlySightFileVersionOf(fileThing) -> FlySightVersion:
    """
    Detects the FlySight file version based on its file name and format.

    Arguments
    ---------
        fileThing
    A string, `bytes` buffer or `pathlib.Path` object corresponding to track
    file.  If string or `pathlib.Path`, it'll be treated as a file.

    Returns
    -------
    An instance of `ssscoring.datatypes.FlySightVersion` with a valid version
    symbolic value.

    Errors
    ------
    `ssscoring.errors.SSScoringError` if `fileThing` is of an unsupported type,
    the file is not a CSV, it isn't found, or it's some other invalid format.
    """
    if isinstance(fileThing, Path):
        fileName = fileThing.as_posix()
    elif isinstance(fileThing, str):
        fileName = fileThing
        fileThing = Path(fileThing)
    elif isinstance(fileThing, bytes):
        # Buffers have no name; use a placeholder that passes the name checks.
        fileName = '00-00-00.CSV'
    else:
        raise SSScoringError('Unsupported file thing type %s' % type(fileThing).__name__)

    delimiters =  [',', ]
    if '.CSV' not in fileName.upper():
        raise SSScoringError('Invalid file extension type')
    if any(x in fileName for x in ('EVENT.CSV', 'SENSOR.CSV')):
        raise SSScoringError('Only TRACK.CSV v2 files can be processed at this time')
    if isinstance(fileThing, Path):
        if not fileThing.is_file():
            raise SSScoringError('%s - file not found in data lake' % fileName)
        if not validFlySightHeaderIn(fileName):
            raise SSScoringError('CSV is not a valid FlySight file')
        stream = open(fileName, 'r')
    else:
        stream = StringIO(fileThing.decode(FLYSIGHT_FILE_ENCODING))

    # The context manager releases the file handle on every exit path,
    # including the error ones.
    with stream:
        try:
            dialect = csv.Sniffer().sniff(stream.readline(), delimiters = delimiters)
        except Exception as e:
            raise SSScoringError('Error while trying to validate %s file format' % fileName) from e
        if dialect.delimiter not in delimiters:
            raise SSScoringError('CSV uses a different delimiter from FlySight')
        stream.seek(0)
        header = next(csv.reader(stream))

    if header[0] == '$FLYS':
        return FlySightVersion.V2
    if FLYSIGHT_1_HEADER.issubset(header):
        return FlySightVersion.V1
    raise SSScoringError('%s file is not a FlySight v1 or v2 file' % fileName)
def isCRMangledCSV(fileThing) -> bool:
def isCRMangledCSV(fileThing) -> bool:
    """
    Reports whether the file at `fileThing` contains CRCRLF line endings, the
    signature left behind when Excel or a similar tool on a Windows system
    re-saves a FlySight CSV file and mangles its original format.

    Arguments
    ---------
        fileThing
    A string or `pathlib.Path` object naming the file to inspect.

    Returns
    -------
    `True` if the CRCRLF byte sequence appears anywhere in the file (the whole
    file is read and scanned, not just a leading chunk), otherwise `False`.
    """
    mangledEOL = b'\r\r\n'
    with open(fileThing, 'rb') as inputFile:
        contents = inputFile.read()
    return mangledEOL in contents

Tests if fileThing is an Excel or Dropbox DOS file with lines terminated in CRCRLF. These occur when someone opens the file with Excel or some other tool in a Windows system and saves the file back to the file system, mangling the original format.

Arguments

fileThing

A string or pathlib.Path object associated with what looks like a FlySight CR mangled file.

Returns

True if the file has one or more lines ending in CRCRLF anywhere in its contents (the whole file is read and scanned), otherwise False.

def fixCRMangledCSV(fileThing):
def fixCRMangledCSV(fileThing):
    r"""
    Rewrite the file associated with `fileThing` in place, replacing every
    `\r\r\n` (CRCRLF) sequence with a `\r\n` (CRLF) EOL marker.

    The docstring is a raw string so the escape sequences above are shown
    literally instead of being embedded as control characters.

    Arguments
    ---------
        fileThing
    A string or `pathlib.Path` object associated with what looks like a FlySight
    CR mangled file.

    See
    ---
    `ssscoring.flysight.isCRMangledCSV`
    """
    with open(fileThing, 'rb') as inputFile:
        fileContents = inputFile.read()
    fileContents = fileContents.replace(b'\r\r\n', b'\r\n')
    # The fixed contents are staged in a temporary file and then copied over the
    # original.  delete = False keeps the staging file on disk after it's
    # closed so that shutil.copy can read it by name.
    outputFile = tempfile.NamedTemporaryFile(delete = False)
    try:
        with outputFile:
            outputFile.write(fileContents)
        shutil.copy(outputFile.name, fileThing)
    finally:
        # Always remove the staging file, even if the write or copy failed.
        os.unlink(outputFile.name)

Open the file associated with fileThing and replace all `\r\r\n` (CRCRLF) sequences with `\r\n` (CRLF) EOL markers.

Arguments
---------
    fileThing
A string or `pathlib.Path` object associated with what looks like a FlySight
CR mangled file.

See
---
`ssscoring.flysight.isCRMangledCSV`
def skipOverFS2MetadataRowsIn(data: pandas.core.frame.DataFrame) -> pandas.core.frame.DataFrame:
 82def skipOverFS2MetadataRowsIn(data: pd.DataFrame) -> pd.DataFrame:
 83    """
 84    Returns a clean dataframe on which any metadata rows within the first 100
 85    are skipped.  This function uses the `time` column to detect valid rows.  A
 86    `time == NaN` is considered invalid and skipped.
 87
 88    Arguments
 89    ---------
 90        data
 91    A FlySight 2 dataframe suspected of having dirty N first rows with metadata
 92
 93    Returns
 94    -------
 95    A FlySight 2 clean dataframe without any leading metadata rows.
 96    """
 97    for ref in range(0,100):
 98        if pd.notnull(data.iloc[ref].time):
 99            break
100    return data.iloc[ref:]

Returns a clean dataframe on which any metadata rows within the first 100 are skipped. This function uses the time column to detect valid rows. A time == NaN is considered invalid and skipped.

Arguments

data

A FlySight 2 dataframe suspected of having dirty N first rows with metadata

Returns

A FlySight 2 clean dataframe without any leading metadata rows.

def validFlySightHeaderIn(fileThingCSV) -> bool:
def validFlySightHeaderIn(fileThingCSV) -> bool:
    """
    Checks if a file is a CSV in FlySight 1 or FlySight 2 formats.  The checks
    include:

    - Whether the file is a CSV, using a comma delimiter
    - Checks for the presence of all the documented FlySight 1 headers
    - Checks for the presence of the FlySight 2 line 1 identifier

    Arguments
    ---------
        fileThingCSV
    A file thing to verify as a valid FlySight file; can be a string, an
    instance of `pathlib.Path`, or a buffer of `bytes`.

    Returns
    -------
    `True` if `fileThingCSV` is a FlySight CSV file, otherwise `False`.
    """
    delimiters =  [',', ]
    if isinstance(fileThingCSV, bytes):
        stream = StringIO(fileThingCSV.decode(FLYSIGHT_FILE_ENCODING))
    else:
        stream = open(fileThingCSV, 'r')

    # The context manager guarantees that the file handle is released on every
    # exit path; StringIO supports the same protocol.
    with stream:
        try:
            dialect = csv.Sniffer().sniff(stream.readline(), delimiters = delimiters)
        except Exception:
            # Best effort:  anything that prevents sniffing the first line
            # means that this isn't a usable FlySight CSV.
            return False
        if dialect.delimiter not in delimiters:
            return False
        stream.seek(0)
        header = next(csv.reader(stream))
    # FlySight 2 files start with the $FLYS tag; FlySight 1 files must carry
    # every documented column name in their first row.
    return header[0] == '$FLYS' or FLYSIGHT_1_HEADER.issubset(header)

Checks if a file is a CSV in FlySight 1 or FlySight 2 formats. The checks include:

  • Whether the file is a CSV, using a comma delimiter
  • Checks for the presence of all the documented FlySight 1 headers
  • Checks for the presence of the FlySight 2 line 1 identifier

Arguments

fileThingCSV

A file thing to verify as a valid FlySight file; can be a string, an instance of pathlib.Path, or a buffer of bytes.

Returns

True if fileThingCSV is a FlySight CSV file, otherwise False.

def getAllSpeedJumpFilesFrom(dataLake: pathlib.Path) -> dict:
def getAllSpeedJumpFilesFrom(dataLake: Path) -> dict:
    """
    Walks a data lake, defined as a reachable path that contains one or more
    FlySight CSV files, and collects every file that looks like a speed skydive
    FlySight track:  it parses as a track file, is at least
    `MIN_JUMP_FILE_SIZE` bytes long, and has a valid FlySight header.

    Arguments
    ---------
        dataLake: str
    A valid (absolute or relative) path name to the top level directory where
    the data lake starts.

    Returns
    -------
    A dictionary, sorted by key, of speed jump files for later SSScoring
    processing:
        - keys are the file paths
        - values are a FlySight version string tag, `'1'` or `'2'`
    """
    candidates = {}
    for root, _dirs, files in os.walk(dataLake):
        if any(name in root for name in IGNORE_LIST):
            continue
        for fileName in files:
            # Ignore Vim, other editors swap files, and anything that isn't CSV
            if '.swp' in fileName or '.CSV' not in fileName.upper():
                continue
            jumpFileName = Path(root) / fileName
            fileSize = os.stat(jumpFileName).st_size
            isFlySight2Track = 'TRACK' in fileName
            if not isFlySight2Track and ('EVENT' in fileName or 'SENSOR' in fileName):
                # FlySight 2 auxiliary files aren't tracks
                continue
            # The parsed dataframe isn't used beyond this point; the parse
            # still runs so that malformed files raise here.
            if isFlySight2Track:
                # FlySight 2 track custom format
                version = '2'
                data = pd.read_csv(jumpFileName, names = FLYSIGHT_2_HEADER, skiprows = 6, index_col = False, na_values = ['NA', ], dtype={ 'hMSL': float, })
                data = skipOverFS2MetadataRowsIn(data)
                data.drop('GNSS', inplace = True, axis = 1)
            else:
                # FlySight 1 track format
                version = '1'
                data = pd.read_csv(jumpFileName, skiprows = (1, 1), index_col = False)
            if fileSize >= MIN_JUMP_FILE_SIZE and validFlySightHeaderIn(jumpFileName):
                candidates[jumpFileName] = version
    return OrderedDict(sorted(candidates.items()))

Get a list of all the speed jump files from a data lake, where data lake is defined as a reachable path that contains one or more FlySight CSV files. This function tests each file to ensure that it's a speed skydive FlySight file in a valid format and length. It doesn't validate data like versions prior to 1.9.0.

Arguments

dataLake: str

A valid (absolute or relative) path name to the top level directory where the data lake starts.

Returns

A dictionary of speed jump file names for later SSScoring processing:

  • keys are the file names
  • values are a FlySight version string tag

def detectFlySightFileVersionOf(fileThing) -> ssscoring.datatypes.FlySightVersion:
def detectFlySightFileVersionOf(fileThing) -> FlySightVersion:
    """
    Detects the FlySight file version based on its file name and format.

    Arguments
    ---------
        fileThing
    A string, `bytes` buffer or `pathlib.Path` object corresponding to track
    file.  If string or `pathlib.Path`, it'll be treated as a file.

    Returns
    -------
    An instance of `ssscoring.datatypes.FlySightVersion` with a valid version
    symbolic value.

    Errors
    ------
    `ssscoring.errors.SSScoringError` if `fileThing` is of an unsupported type,
    the file is not a CSV, it isn't found, or it's some other invalid format.
    """
    if isinstance(fileThing, Path):
        fileName = fileThing.as_posix()
    elif isinstance(fileThing, str):
        fileName = fileThing
        fileThing = Path(fileThing)
    elif isinstance(fileThing, bytes):
        # Buffers have no name; use a placeholder that passes the name checks.
        fileName = '00-00-00.CSV'
    else:
        raise SSScoringError('Unsupported file thing type %s' % type(fileThing).__name__)

    delimiters =  [',', ]
    if '.CSV' not in fileName.upper():
        raise SSScoringError('Invalid file extension type')
    if any(x in fileName for x in ('EVENT.CSV', 'SENSOR.CSV')):
        raise SSScoringError('Only TRACK.CSV v2 files can be processed at this time')
    if isinstance(fileThing, Path):
        if not fileThing.is_file():
            raise SSScoringError('%s - file not found in data lake' % fileName)
        if not validFlySightHeaderIn(fileName):
            raise SSScoringError('CSV is not a valid FlySight file')
        stream = open(fileName, 'r')
    else:
        stream = StringIO(fileThing.decode(FLYSIGHT_FILE_ENCODING))

    # The context manager releases the file handle on every exit path,
    # including the error ones.
    with stream:
        try:
            dialect = csv.Sniffer().sniff(stream.readline(), delimiters = delimiters)
        except Exception as e:
            raise SSScoringError('Error while trying to validate %s file format' % fileName) from e
        if dialect.delimiter not in delimiters:
            raise SSScoringError('CSV uses a different delimiter from FlySight')
        stream.seek(0)
        header = next(csv.reader(stream))

    if header[0] == '$FLYS':
        return FlySightVersion.V2
    if FLYSIGHT_1_HEADER.issubset(header):
        return FlySightVersion.V1
    raise SSScoringError('%s file is not a FlySight v1 or v2 file' % fileName)

Detects the FlySight file version based on its file name and format.

Arguments

fileThing

A string, bytes buffer or pathlib.Path object corresponding to track file. If string or pathlib.Path, it'll be treated as a file.

Returns

An instance of ssscoring.datatypes.FlySightVersion with a valid version symbolic value.

Errors

ssscoring.errors.SSScoringError if the file is not a CSV, or if it is in some other invalid format.