-
The dialect sniffing procedure:

```python
def sniff(self) -> Union[Dialect, None]:
    self.validate()
    # Initialize dialects and scores
    dialects = p_dialects.get_dialects(p_dialects(self.delimiter_list, self.quotechar_list))
    n = len(dialects)
    scores = [0] * n
    # Compute a score for every candidate dialect
    j = 0
    for d in dialects:
        t_scoring = t_score(csv_path=self.file_path,
                            dialect=d,
                            threshold=self.threshold,
                            encoding=self.encoding)
        try:
            scores[j] = t_scoring.compute()
        except Exception:
            scores[j] = 0
        j += 1
    return get_best_dialect(scores, dialects)


def get_best_dialect(
        scores_: List[float],
        dialects_: List[Dialect]
) -> Union[Dialect, None]:
    max_score = max(scores_)
    if max_score > 0:
        return dialects_[scores_.index(max_score)]
    return None
```
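For readers who want to try the score-and-take-argmax pattern outside this codebase, here is a minimal self-contained sketch using only Python's standard library. The `make_dialect` and `score_dialect` helpers are hypothetical stand-ins for `p_dialects.get_dialects` and `t_score.compute` (the real score is the table-uniformity computation shown in the next comments); the consistency-of-field-counts heuristic below is only a crude proxy.

```python
import csv
from typing import List, Optional


def make_dialect(delim: str, quote: str) -> csv.Dialect:
    """Build a simple csv.Dialect for a (delimiter, quotechar) pair."""
    class _D(csv.Dialect):
        delimiter = delim
        quotechar = quote
        doublequote = True
        skipinitialspace = False
        lineterminator = "\r\n"
        quoting = csv.QUOTE_MINIMAL
    return _D()


def score_dialect(path: str, dialect: csv.Dialect) -> float:
    """Crude stand-in for t_score.compute: reward consistent field counts."""
    with open(path, newline="", encoding="utf-8") as f:
        counts = [len(row) for row in csv.reader(f, dialect=dialect) if row]
    if not counts:
        return 0.0
    mode = max(set(counts), key=counts.count)
    if mode < 2:
        return 0.0  # a single column usually means nothing was split
    return counts.count(mode) / len(counts)


def sniff(path: str) -> Optional[csv.Dialect]:
    dialects = [make_dialect(d, q) for d in ",;\t|" for q in "\"'"]
    scores = []
    for d in dialects:
        try:
            scores.append(score_dialect(path, d))
        except (csv.Error, UnicodeDecodeError):
            scores.append(0.0)
    best = max(scores)
    return dialects[scores.index(best)] if best > 0 else None
```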
-
The table score procedure:

```python
def compute(self) -> float:
    self.validate()
    # Initialize the table object
    table_obj = table_constructor(file_path=self.csv_path,
                                  threshold=self.threshold,
                                  encoding=self.encoding)
    # Construct a sample table from the CSV; lines starting with '#' are skipped
    sample = table_obj.fromCSV(_dialect=self.dialect)
    # Compute the record score by inferring the data type of each field
    detector = type_detector()
    record_score = detector.record_score(data=sample, dialect=self.dialect)
    # Compute the table uniformity parameters
    _uniformity = t_uniformity(table=sample)
    tau = _uniformity.compute()
    n = len(sample)
    if n > 1:
        gamma = tau[0] / self.threshold + (1 / (tau[1] + n))
    else:
        eta = math.sqrt(record_score) / 10
        k = len(sample[0])
        gamma = (eta + (1 / k)) / (k - math.floor(eta * k) + 1)
    return gamma * record_score
```
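To make the two branches of the `gamma` formula concrete, here is a small worked example; all of the numbers (including the threshold) are made up for illustration.

```python
import math

# Multi-record branch: a perfectly uniform 10-record table
tau_0, tau_1 = 0.8, 0.0       # illustrative output of t_uniformity.compute()
n, threshold = 10, 1.0        # the threshold value is an assumption
gamma = tau_0 / threshold + (1 / (tau_1 + n))
print(gamma)                  # 0.8 + 0.1 = 0.9

# Single-record branch (n == 1): one record with k = 4 fields, all of a
# known type, so record_score = (100 * 4) ** 2 / (100 * 4 ** 2) = 100
record_score, k = 100.0, 4
eta = math.sqrt(record_score) / 10                       # = 1.0
gamma = (eta + (1 / k)) / (k - math.floor(eta * k) + 1)  # = 1.25
print(gamma * record_score)                              # 125.0
```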
-
The Table Uniformity procedure:

```python
def avg_fields(self) -> float:
    nk = 0
    for record in self.table:
        nk += len(record)
    return nk / len(self.table)


def compute(self) -> List[float]:
    self.validate()
    phi = self.avg_fields()
    mu, c, sm, alpha, beta = 0, 0, 0, 0, 0
    n = len(self.table)  # Number of records
    for i in range(0, n):
        k_i = len(self.table[i])  # Number of fields in the current record
        mu += math.pow(k_i - phi, 2)  # Squared deviations
        if i == 0:
            c += 1
            k_max = k_i
            k_min = k_i
            if n == 1:
                sm = c
        else:
            if k_i > k_max:
                k_max = k_i
            if k_i < k_min:
                k_min = k_i
            k_prev = len(self.table[i - 1])
            if k_prev != k_i:
                alpha += 1
                if c > sm:
                    sm = c  # Segmented mode
                c = 0
            else:
                c += 1
            if i == n - 1:
                if c > sm:
                    sm = c
    if n > 1:
        sigma = math.sqrt(mu / (n - 1))  # Standard deviation
    else:
        sigma = math.sqrt(mu / n)
    tau_0 = 1 / (1 + 2 * sigma)  # Consistency factor
    r = k_max - k_min  # Range of field counts
    if alpha > 0:
        beta = sm / n  # Records variability factor
    tau_1 = 2 * r * (math.pow(alpha, 2) + 1) * ((1 - beta) / sm)  # Records dispersion
    return [tau_0, tau_1]
```
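To see what the two returned numbers express, here is a self-contained, condensed re-implementation of the same computation (it matches the loop semantics above) run on two toy tables: `tau_0` is 1 for a perfectly consistent table and shrinks as field counts vary, while `tau_1` is 0 for a consistent table and grows for ragged ones.

```python
import math
from typing import List


def table_uniformity(table: List[List[str]]) -> List[float]:
    """Condensed re-implementation of compute() above, for experimentation."""
    n = len(table)
    counts = [len(r) for r in table]
    phi = sum(counts) / n                       # average number of fields
    mu = sum((k - phi) ** 2 for k in counts)    # sum of squared deviations
    sigma = math.sqrt(mu / (n - 1)) if n > 1 else math.sqrt(mu / n)
    tau_0 = 1 / (1 + 2 * sigma)                 # consistency factor
    alpha, c, sm = 0, 1, 1 if n == 1 else 0
    for i in range(1, n):
        if counts[i] != counts[i - 1]:
            alpha += 1                          # a field-count change
            sm, c = max(sm, c), 0               # close the current segment
        else:
            c += 1
    sm = max(sm, c)                             # segmented mode
    r = max(counts) - min(counts)               # range of field counts
    beta = sm / n if alpha > 0 else 0           # records variability factor
    tau_1 = 2 * r * (alpha ** 2 + 1) * ((1 - beta) / sm)  # records dispersion
    return [tau_0, tau_1]


uniform = [["a", "b", "c"]] * 5
ragged = [["a", "b", "c"], ["a", "b"], ["a", "b", "c", "d"], ["a"], ["a", "b"]]
print(table_uniformity(uniform))  # [1.0, 0.0]
print(table_uniformity(ragged))   # roughly [0.30, 81.6]
```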
-
The record score procedure:

```python
def record_score(self,
                 data: List[List],
                 dialect: Dialect
                 ) -> float:
    """
    Computes the record score as a sum over the detected data type of each cell.

    Parameters
    ----------
    data : List[List[str]]
        the data as a list of string lists
    dialect : Dialect
        the dialect used to read the file

    Returns
    -------
    record_score : float
    """
    try:
        td = type_detector()
        total_score = 0
        for record in data:
            tmp_sum = 0
            k = 0
            for field in record:
                k += 1
                if td.is_known_type(trip_quotes(field, dialect)):
                    tmp_sum += 100
                else:
                    tmp_sum += 0.1
            total_score += math.pow(tmp_sum, 2) / (100 * math.pow(k, 2))
        return total_score
    except Exception:
        return 0.0


def trip_quotes(
        field: str,
        dialect: Dialect
) -> str:
    if field != '':
        # Remove surrounding whitespace before handling quotes
        field = field.strip()
        return trip_ends(field, dialect) if field != '' else field
    return field


def trip_ends(
        field: str,
        dialect: Dialect
) -> str:
```
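The body of `trip_ends` is cut off above, and `is_known_type` is never shown in the thread. For completeness, here is a plausible sketch of both; the behavior is guessed (`trip_ends` dropping one matching pair of the dialect's quote characters, `is_known_type` trying a few common parses) and is not the author's code. Note also how the per-record term behaves: a record whose k fields all have a known type contributes (100k)²/(100k²) = 100 to the total, while a record with no recognized fields contributes only (0.1k)²/(100k²) = 0.0001.

```python
from datetime import datetime


def trip_ends(field: str, dialect) -> str:
    # Assumed behavior: drop one matching pair of the dialect's quote
    # character from the ends of the field
    q = dialect.quotechar
    if q and len(field) >= 2 and field.startswith(q) and field.endswith(q):
        return field[1:-1]
    return field


def is_known_type(field: str) -> bool:
    # Assumed behavior: a field "has a known type" if it parses as a
    # boolean, a number, or a common date format
    if field.lower() in ("true", "false"):
        return True
    try:
        float(field)
        return True
    except ValueError:
        pass
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y"):
        try:
            datetime.strptime(field, fmt)
            return True
        except ValueError:
            continue
    return False
```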
-
Problem overview
Currently, this project does not have a stable way of detecting a CSV file's configuration (dialect). One example is raised in #1719, where the utility fails to detect the configuration of the given files.
Details
At the moment, @jqnatividad has begun digging into the problem and has pointed out a way forward, outlined in jqnatividad/qsv-sniffer#14. So far, all of the tasks there are under study but none are completed.
New path
In this discussion I will describe a new approach to implementing dialect detection in qsv using trivial elements. With this approach, dialect detection is as reliable as CleverCSV's, and results can be obtained with greater certainty. The process is detailed in the procedures above (dialect sniffing, table score, table uniformity, and record score).
A Python implementation of this exact approach is described in a GitHub repository. The evaluation compares three sniffers:
- CSVsniffer
- CleverCSV
- csv.Sniffer

This sheds light on one point: the presented approach clearly outperforms csv.Sniffer, and also CleverCSV, on the research datasets. Hoping this can help this wonderful project!