from docx import Document as DocxDocument
from .Document import Document

class WordDocument(Document):
    """Document subclass that reads its content from a file via python-docx."""

    def extract_content(self):
        """Extract the paragraph text of the file at ``self.path``.

        The file is opened with ``DocxDocument``; the text of every paragraph
        that is not empty or whitespace-only is joined with newlines, then
        passed through ``self.clean_text`` and ``self.remove_repeated_lines``
        (neither is defined in this class — presumably inherited from
        ``Document``; verify against the base class).

        Returns:
            A ``(text, images)`` tuple. ``images`` is always an empty list;
            this method performs no image extraction.
        """
        parsed = DocxDocument(self.path)

        # Keep only paragraphs containing something other than whitespace.
        kept_paragraphs = []
        for paragraph in parsed.paragraphs:
            if paragraph.text.strip():
                kept_paragraphs.append(paragraph.text)

        joined = "\n".join(kept_paragraphs)
        processed = self.remove_repeated_lines(self.clean_text(joined))
        # NOTE(review): a `self.fix_text(text)` pass was commented out here —
        # confirm whether it should be restored or dropped for good.
        return processed, []