数据打标签
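# pip install gradio pandas pymupdf openpyxl
# (json and os ship with Python; `fitz` is provided by the PyMuPDF package;
#  openpyxl is the engine pandas uses to write .xlsx files)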
import gradio as gr
import pandas as pd
import os
import json
import fitz # PyMuPDF
# Shared module state, read and mutated by the callbacks defined below.
# Labels are kept column-wise: one list per column, all the same length.
label_data = {
    column: []
    for column in (
        'data_type', 'pdf_name',
        'Label1', 'Label2', 'Label3', 'Label4', 'Label5', 'Label6',
    )
}
# Paths of the loaded PDFs plus the cursor (which file, which page).
pdf_files = []
current_pdf_index = current_page_index = 0
# Collected question/answer records.
qa_data = []
# Define functions for PDF operations
def load_pdfs(folder_path):
    """Scan a folder for PDF files and select the first one.

    On success the module-level ``pdf_files`` list is replaced and both the
    PDF cursor and the page cursor are reset to 0. On failure the previous
    state is left untouched.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The base name of the first PDF, the "no files" placeholder from
        ``get_current_pdf_name`` when the folder holds no PDFs, or an
        ``"Error loading PDFs: ..."`` message if the folder cannot be listed.
    """
    global pdf_files, current_pdf_index, current_page_index
    try:
        # os.listdir returns entries in arbitrary order, so sort to give the
        # previous/next buttons a stable sequence. The extension is compared
        # case-insensitively so files such as "REPORT.PDF" are not skipped.
        found = sorted(
            os.path.join(folder_path, entry)
            for entry in os.listdir(folder_path)
            if entry.lower().endswith('.pdf')
        )
    except (OSError, TypeError, ValueError) as e:
        return f"Error loading PDFs: {e}"
    pdf_files = found
    current_pdf_index = 0
    current_page_index = 0
    return get_current_pdf_name()
def get_current_pdf_name():
    """Return the file name (without directory) of the selected PDF.

    Returns the placeholder text "No PDF files found" when nothing is loaded.
    """
    if not pdf_files:
        return "No PDF files found"
    selected_path = pdf_files[current_pdf_index]
    return os.path.basename(selected_path)
def read_pdf(page_num=0):
    """Extract the plain text of one page of the currently selected PDF.

    Args:
        page_num: Zero-based page index within the current PDF.

    Returns:
        The page text, or one of the status strings "No content" (no PDFs
        loaded), "Page number out of range", or "Error reading PDF: ...".
    """
    if not pdf_files:
        return "No content"
    try:
        # Open through a context manager so the document (and its file
        # handle) is closed on every path, including the early returns.
        with fitz.open(pdf_files[current_pdf_index]) as pdf_document:
            if 0 <= page_num < pdf_document.page_count:
                page = pdf_document.load_page(page_num)
                return page.get_text('text')
            return "Page number out of range"
    except Exception as e:
        # UI boundary: surface any PyMuPDF/file error as a message.
        return f"Error reading PDF: {e}"
def next_pdf():
    """Move the cursor to the following PDF, if there is one.

    When the cursor moves, the page cursor is reset to 0; at the last PDF
    (or with nothing loaded) the state is left as it is.

    Returns:
        A ``(pdf_name, page_text)`` tuple for the PDF that is now current.
    """
    global current_pdf_index, current_page_index
    has_following = bool(pdf_files) and current_pdf_index + 1 < len(pdf_files)
    if has_following:
        current_pdf_index += 1
        # A new document always starts at its first page.
        current_page_index = 0
    name = get_current_pdf_name()
    text = read_pdf(current_page_index)
    return name, text
def previous_pdf():
    """Move the cursor to the preceding PDF, if there is one.

    When the cursor moves, the page cursor is reset to 0; at the first PDF
    (or with nothing loaded) the state is left as it is.

    Returns:
        A ``(pdf_name, page_text)`` tuple for the PDF that is now current.
    """
    global current_pdf_index, current_page_index
    has_preceding = bool(pdf_files) and current_pdf_index > 0
    if has_preceding:
        current_pdf_index -= 1
        # A new document always starts at its first page.
        current_page_index = 0
    name = get_current_pdf_name()
    text = read_pdf(current_page_index)
    return name, text
def next_page():
    """Advance to the next page of the current PDF, stopping at the last one.

    The page cursor is only incremented when a following page exists, so
    repeated clicks at the end of a document no longer push the cursor out of
    range (which previously required an equal number of "previous page"
    clicks to get back to real content).

    Returns:
        The text of the page that is current after the call, or a status
        string from ``read_pdf``.
    """
    global current_page_index
    if pdf_files:
        try:
            with fitz.open(pdf_files[current_pdf_index]) as pdf_document:
                page_count = pdf_document.page_count
        except Exception:
            # Unreadable file: do not move; read_pdf below reports the error.
            page_count = 0
        if current_page_index + 1 < page_count:
            current_page_index += 1
    return read_pdf(current_page_index)
def previous_page():
    """Step back one page in the current PDF, stopping at the first page.

    Returns:
        Whatever ``read_pdf`` yields for the page that is current afterwards.
    """
    global current_page_index
    # Step is 0 on the first page so the cursor never goes negative.
    step = 1 if current_page_index > 0 else 0
    current_page_index -= step
    return read_pdf(current_page_index)
def add_label(data_type, pdf_name, label1, label2, label3, label4, label5, label6):
    """Record one row of labels for a PDF in the module-level ``label_data``.

    Args:
        data_type: Free-text type of the document.
        pdf_name: Name identifying the PDF; each name may be labeled once.
        label1..label6: The six label values for this PDF.

    Returns:
        A status message: a rejection when no PDF name is given, a notice
        when the PDF was already labeled, or a confirmation carrying the
        number of labeled PDFs so far.
    """
    # Without this guard, clicking "add" before any PDF is loaded stores a
    # row whose pdf_name is empty.
    if not pdf_name or not pdf_name.strip():
        return "No PDF selected; load a PDF before adding labels"
    if pdf_name in label_data['pdf_name']:
        return f"PDF '{pdf_name}' has already been labeled"
    row = {
        'data_type': data_type,
        'pdf_name': pdf_name,
        'Label1': label1,
        'Label2': label2,
        'Label3': label3,
        'Label4': label4,
        'Label5': label5,
        'Label6': label6,
    }
    # Append to every column together so all column lists stay equal length.
    for column, value in row.items():
        label_data[column].append(value)
    total_labels = len(label_data['pdf_name'])
    return f"Label added for {total_labels} PDF(s)"
def export_to_excel(output_folder, excel_name):
    """Write all collected labels to an Excel workbook.

    Args:
        output_folder: Directory that will contain the workbook.
        excel_name: File name; ".xlsx" is appended when it is missing.

    Returns:
        A status message: the written path on success, otherwise an
        "Error exporting to Excel: ..." message.
    """
    # A blank name would otherwise become a hidden file called ".xlsx".
    if not excel_name or not excel_name.strip():
        return "Error exporting to Excel: no file name given"
    if not excel_name.endswith('.xlsx'):
        excel_name += '.xlsx'
    df = pd.DataFrame(label_data)
    output_path = os.path.join(output_folder, excel_name)
    try:
        df.to_excel(output_path, index=False)
    except Exception as e:
        # UI boundary: report missing folder/engine/permission as a message.
        return f"Error exporting to Excel: {e}"
    return f"File exported to {output_path}"
# Define new text processing operations
def count_words(text):
    """Return how many whitespace-separated tokens *text* contains."""
    return len(text.split())
def count_characters(text):
    """Return the length of *text*, counting whitespace as well."""
    character_total = len(text)
    return character_total
def add_qa_pair(question, answer):
    """Append one question/answer record to the module-level ``qa_data``.

    The record stores the question under "instruction", the answer under
    "output", and an empty string under "input".

    Returns:
        A confirmation message that includes the running number of pairs.
    """
    record = {"instruction": question, "input": "", "output": answer}
    qa_data.append(record)
    pair_total = len(qa_data)
    return f"Q&A pair added. Total pairs: {pair_total}"
def export_qa_to_json(output_dir, output_file):
    """Write all collected Q&A records to a UTF-8 JSON file.

    Args:
        output_dir: Directory that will contain the file.
        output_file: File name; ".json" is appended when it is missing.

    Returns:
        A status message: the written path on success, otherwise an
        "Error exporting to JSON: ..." message.
    """
    # A blank name would otherwise become a hidden file called ".json".
    if not output_file or not output_file.strip():
        return "Error exporting to JSON: no file name given"
    if not output_file.endswith('.json'):
        output_file += '.json'
    output_path = os.path.join(output_dir, output_file)
    try:
        with open(output_path, 'w', encoding='utf-8') as file:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(qa_data, file, ensure_ascii=False, indent=4)
    except Exception as e:
        # UI boundary: report missing folder/permission problems as a message.
        return f"Error exporting to JSON: {e}"
    return f"Q&A pairs exported to {output_path}"
# Create Gradio interface
# NOTE(review): the original indentation of this section was lost, so the
# nesting of components inside rows/columns below is a reconstruction that
# keeps the original creation order - confirm against the intended layout.
with gr.Blocks() as labelmaker:
    gr.Markdown("### 🐭数据标注工具🐭")

    # ---- Tab 1: browse PDFs and attach labels -------------------------
    with gr.Tab("PDF 标签标注"):
        with gr.Row():
            with gr.Column():
                folder_input = gr.Textbox(label="PDF 文件夹路径")
                load_button = gr.Button("加载 PDF 文件")
                pdf_name_display = gr.Textbox(label="当前 PDF 文件名", interactive=False)
                pdf_content_display = gr.Textbox(label="PDF 内容", interactive=False, lines=20)
                previous_pdf_button = gr.Button("上一个 PDF")
                next_pdf_button = gr.Button("下一个 PDF")
                previous_page_button = gr.Button("上一页")
                next_page_button = gr.Button("下一页")
            with gr.Column():
                data_type_input = gr.Textbox(label="文件类型")
                # The read-only name box doubles as the PDF-name input for
                # labeling, so users cannot edit the name by hand.
                pdf_name_input = pdf_name_display
                label1_input = gr.Textbox(label="标签1")
                label2_input = gr.Textbox(label="标签2")
                label3_input = gr.Textbox(label="标签3")
                label4_input = gr.Textbox(label="标签4")
                label5_input = gr.Textbox(label="标签5")
                label6_input = gr.Textbox(label="标签6")
                add_button = gr.Button("添加标签")
                output_folder_input = gr.Textbox(label="输出文件夹路径")
                excel_name_input = gr.Textbox(label="保存的excel名称")
                export_button = gr.Button("导出 Excel 文件")
                add_output = gr.Textbox(label="输出消息")
                export_output = gr.Textbox(label="导出消息")

        def load_and_show(folder_path):
            """Load the folder, then return (pdf name, page text) together.

            Doing both steps in one handler guarantees the page is read only
            after the file list has been replaced. Separate click listeners
            on the same button are not ordered relative to each other.
            """
            current_pdf_name = load_pdfs(folder_path)
            return current_pdf_name, read_pdf(current_page_index)

        load_button.click(
            load_and_show,
            inputs=[folder_input],
            outputs=[pdf_name_display, pdf_content_display],
        )
        next_pdf_button.click(next_pdf, outputs=[pdf_name_display, pdf_content_display])
        previous_pdf_button.click(previous_pdf, outputs=[pdf_name_display, pdf_content_display])
        next_page_button.click(next_page, outputs=[pdf_content_display])
        previous_page_button.click(previous_page, outputs=[pdf_content_display])
        add_button.click(
            add_label,
            inputs=[
                data_type_input, pdf_name_display,
                label1_input, label2_input, label3_input,
                label4_input, label5_input, label6_input,
            ],
            outputs=add_output,
        )
        export_button.click(
            export_to_excel,
            inputs=[output_folder_input, excel_name_input],
            outputs=export_output,
        )

    # ---- Tab 2: collect Q&A pairs and export them as JSON -------------
    with gr.Tab("Q&A 整合及格式化"):
        with gr.Row():
            with gr.Column(scale=2):  # takes 2/3 of the row width
                question_input = gr.Textbox(label="输入问题")
                answer_input = gr.Textbox(label="输入回答")
                add_qa_button = gr.Button("添加 Q&A")
                qa_count_display = gr.Textbox(label="Q&A 个数", interactive=False)
            with gr.Column(scale=1):  # takes 1/3 of the row width
                output_dir_input = gr.Textbox(label="输出文件夹路径")
                output_file_input = gr.Textbox(label="输出文件名", value="(改成自己的文件名).json")
                export_qa_button = gr.Button("导出 Q&A 到 JSON 文件")
                export_qa_output = gr.Textbox(label="输出消息", interactive=False)

        add_qa_button.click(add_qa_pair, inputs=[question_input, answer_input], outputs=qa_count_display)
        export_qa_button.click(
            export_qa_to_json,
            inputs=[output_dir_input, output_file_input],
            outputs=[export_qa_output],
        )

labelmaker.launch()
数据打标签
http://192.144.219.54/:8080//archives/1720491463217