Model/playground.py
2026-05-26 16:18:26 +00:00

57 lines
1.6 KiB
Python

"""Read a file and return unique values from a chosen column."""
from pathlib import Path
import argparse
import sys
import pandas as pd
def read_file(path: str | Path) -> pd.DataFrame:
path = Path(path)
suffix = path.suffix.lower()
if suffix == ".csv":
return pd.read_csv(path)
if suffix == ".tsv":
return pd.read_csv(path, sep="\t")
if suffix in {".xlsx", ".xls"}:
return pd.read_excel(path)
if suffix == ".parquet":
return pd.read_parquet(path)
if suffix == ".json":
return pd.read_json(path)
raise ValueError(f"Unsupported file type: {suffix}")
def get_unique(path: str | Path, column: str, dropna: bool = True) -> list:
    """Return the distinct values found in one column of a file.

    Args:
        path: File to load; it is read with ``read_file``.
        column: Name of the column to inspect.
        dropna: When true (the default), missing values are removed
            before the distinct values are collected.

    Returns:
        A list built from ``Series.unique()`` for the selected column.

    Raises:
        KeyError: If ``column`` is not among the file's columns; the
            message lists the columns that are available.
    """
    frame = read_file(Path(path))
    if column in frame.columns:
        values = frame[column]
        if dropna:
            values = values.dropna()
        return values.unique().tolist()
    raise KeyError(f"Column {column!r} not found. Available: {list(frame.columns)}")
def main() -> int:
    """Command-line entry point: print the unique values of one column.

    Options:
        --path      File to read (passed to ``read_file``).
        --column    Column whose unique values are printed. Defaults to
                    ``walls_description``. Passing ``--column`` with no
                    value lists the available columns instead.
        --keep-na   Keep missing values instead of dropping them.

    Unrecognised command-line arguments are ignored rather than rejected.

    Returns:
        0 on success, 1 if the requested column is not in the file.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--path", default="/workspaces/model/certificates-2026.csv")
    # nargs must be one of argparse's specifiers; "?" makes the value
    # optional so a bare ``--column`` yields None and triggers the listing.
    parser.add_argument("--column", nargs="?", default="walls_description")
    parser.add_argument("--keep-na", action="store_true")
    args, _ = parser.parse_known_args()

    df = read_file(args.path)

    if not args.column:
        print("Available columns:")
        for name in df.columns:
            print(f" - {name}")
        return 0

    # Honour the requested column instead of a hard-coded name.
    column = args.column
    if column not in df.columns:
        print(
            f"Column {column!r} not found. Available: {list(df.columns)}",
            file=sys.stderr,
        )
        return 1

    series = df[column] if args.keep_na else df[column].dropna()
    for value in series.unique():
        print(value)
    return 0
# Run the CLI only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())