"""Interactively deduplicate an Excel file on a chosen column."""

import pandas as pd
from pathlib import Path


def get_directory() -> str:
    """Prompt for a directory until an existing one is entered."""
    while True:
        directory: str = input("Enter the directory to search: ")
        if not Path(directory).exists():
            print("Directory does not exist, please enter the directory to search again.")
        else:
            return directory


def get_selected_file(excel_files):
    """Let the user pick one of the Excel files by number."""
    print("Available Excel files:")
    for i, file in enumerate(excel_files):
        print(f"{i + 1}. {file}")
    while True:
        selection = input("Select the Excel file to process (enter its number): ")
        try:
            file_index = int(selection) - 1
            # Reject 0 and out-of-range numbers; otherwise Python's negative
            # indexing would silently pick a file from the end of the list.
            if not 0 <= file_index < len(excel_files):
                raise IndexError
            selected_file = excel_files[file_index]
            full_path = Path(selected_file).resolve()
            print(f"You selected the Excel file: {full_path}")
            return full_path
        except (ValueError, IndexError):
            print("Invalid selection, please try again.")


def get_column_name(all_columns):
    """Prompt for the column to deduplicate on until a valid name is entered."""
    while True:
        select_col = input("Enter the column name to deduplicate on: ")
        if select_col not in all_columns:
            print("Invalid column name, please try again.")
        else:
            return select_col


def drop_duplicates(df, column_name):
    """Report duplicate rows in the given column and return the deduplicated frame."""
    # Rows whose value in column_name already appeared earlier; the first occurrence is kept.
    duplicates = df[df.duplicated(subset=[column_name], keep="first")]
    # These are DataFrame index labels (0-based), not Excel row numbers.
    numbers = duplicates.index.values.tolist()
    print(f"Row indexes of duplicate values:\n{numbers}")
    print(f"Number of duplicates: {len(duplicates)}")
    unique_rows = df.drop_duplicates(subset=[column_name], keep="first")
    print(f"Number of rows after removing duplicates: {len(unique_rows)}")
    return unique_rows


def save_to_excel(directory, unique_rows):
    """Optionally save the deduplicated data to a new Excel file."""
    user_input = input("Remove the duplicate data and save? (y/n): ").lower()
    if user_input == "y":
        new_file_path = Path(directory) / "new_file.xlsx"
        unique_rows = unique_rows.reset_index(drop=True)
        # Writing .xlsx requires the openpyxl engine to be installed.
        unique_rows.to_excel(new_file_path, index=False)
        print(f"Duplicates removed; the deduplicated data was saved to: {new_file_path}")
    else:
        print("Duplicates were not removed.")


def main():
    directory = get_directory()
    # Only non-recursive search for .xlsx workbooks in the chosen directory.
    excel_files = list(Path(directory).glob("*.xlsx"))
    if not excel_files:
        print("No .xlsx files found in this directory.")
        return
    selected_file = get_selected_file(excel_files)
    # Read the first sheet of the selected workbook.
    df = pd.read_excel(selected_file, sheet_name=0)
    all_columns = set(df.columns)
    print("All columns in the Excel file:")
    for col in df.columns:
        print(col)
    column_name = get_column_name(all_columns)
    unique_rows = drop_duplicates(df, column_name)
    save_to_excel(directory, unique_rows)


if __name__ == "__main__":
    main()
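

# A minimal non-interactive sketch of the same deduplication step, useful for
# quick checks in a REPL. The file name "data.xlsx" and the column name "id"
# below are placeholders for illustration only, not part of this script.
#
#     import pandas as pd
#     df = pd.read_excel("data.xlsx", sheet_name=0)
#     deduped = df.drop_duplicates(subset=["id"], keep="first")
#     deduped.to_excel("data_deduped.xlsx", index=False)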