Kreol84cav
Вот, поменял кое-что, пробуйте (файл 1.pdf всё равно не обрабатывается).
[more=скрипт]#Include <Array.au3>
; Batch-renames PDF files after their title.
; For every *.pdf in the working directory the script:
;   1. converts page 2 to XML with pdftohtml.exe (written to out.xml),
;   2. finds the largest text height on that page,
;   3. joins all text runs that use this height into the document title,
;   4. copies the PDF to "<title>.pdf" (overwriting an existing copy).
$search = FileFindFirstFile("*.pdf")
If $search = -1 Then
    MsgBox(0, "Error", "No files/directories matched the search pattern")
    Exit
EndIf

; $file[0] is an unused placeholder; matches are stored at indexes 1..$i.
$i = 0
Dim $file[1]
While 1
    $file_n = FileFindNextFile($search)
    If @error Then ExitLoop
    _ArrayAdd($file, $file_n)
    $i = $i + 1
WEnd
FileClose($search)

For $j = 1 To $i
    ; Drop the XML left by the previous file so that a failed conversion
    ; cannot be mistaken for the output of the current file.
    FileDelete("out.xml")

    ; The path is quoted: a name with spaces would otherwise be split
    ; into several command-line arguments.
    RunWait('pdftohtml.exe -f 2 -l 2 -hidden -xml -enc UTF-8 "' & $file[$j] & '" out', '', @SW_HIDE)

    ; --- pass 1: find the maximum font height ---
    $out = FileOpen("out.xml", 0)
    If $out = -1 Then
        MsgBox(0, "Error", "Unable to open xml file.")
        ; Skip only this file instead of aborting the whole batch.
        ContinueLoop
    EndIf
    $max_font = 1
    While 1
        $line = FileReadLine($out)
        If @error = -1 Then ExitLoop
        ; Reduce the line to the bare value of its height="..." attribute.
        $line = StringRegExpReplace($line, '<text.*height="', '')
        $line = StringRegExpReplace($line, '" font.*', '')
        If StringIsInt($line) Then
            $font = Number($line)
            If $font > $max_font Then $max_font = $font
        EndIf
    WEnd
    FileClose($out)

    ; --- pass 2: collect the text of every run drawn with that height ---
    $out = FileOpen("out.xml", 0)
    If $out = -1 Then
        MsgBox(0, "Error", "Unable to open xml file.")
        ContinueLoop
    EndIf
    $name = ''
    While 1
        $line = FileReadLine($out)
        If @error = -1 Then ExitLoop
        ; The closing quote makes the height match exact rather than a prefix match.
        If StringRegExp($line, '<text.*height="' & $max_font & '"') Then
            $temp = StringRegExpReplace($line, '<text.*">', '')
            $temp = StringRegExpReplace($temp, '</text>', '')
            $name = $name & ' ' & $temp
        EndIf
    WEnd
    FileClose($out)

    ; Remove the inline formatting tags emitted by pdftohtml.
    $name = StringReplace($name, '<b>', '')
    $name = StringReplace($name, '</b>', '')
    $name = StringReplace($name, '<i>', '')
    $name = StringReplace($name, '</i>', '')
    $name = StringStripCR($name)
    $name = _Utf8ToAnsi($name)
    ; Replace characters that are illegal in Windows file names. This runs
    ; after the ANSI conversion because unmappable characters become "?".
    $name = StringRegExpReplace($name, '[\\/:*?"<>|]', '_')
    $name = StringStripWS($name, 3)

    If $name <> '' Then FileCopy($file[$j], $name & '.pdf', 1)
Next
Func _Utf8ToAnsi($utf8string); (c) ViSiToR
    ; Converts text whose characters are the raw bytes of a UTF-8 sequence
    ; into a string in the system ANSI code page (CP_ACP).
    ;   $utf8string - UTF-8 data held one byte per character.
    ; Returns the converted string. Returns "" for empty input, and ""
    ; with @error set (1 = UTF-8 decoding failed, 2 = ANSI encoding failed)
    ; when a Win32 call fails.
    If StringLen($utf8string) = 0 Then Return ""

    ; Ask the API how many UTF-16 units are needed (terminator included,
    ; because the source length is given as -1).
    Local $ret = DllCall("Kernel32.dll", "int", "MultiByteToWideChar", _
            "int", 65001, "int", 0, _
            "str", $utf8string, "int", -1, _
            "ptr", 0, "int", 0)
    If @error Or $ret[0] <= 0 Then Return SetError(1, 0, "")
    Local $wideCount = $ret[0]

    ; UTF-8 -> UTF-16; the size argument is a count of wide characters, not bytes.
    Local $buf = DllStructCreate("wchar[" & $wideCount & "]")
    $ret = DllCall("Kernel32.dll", "int", "MultiByteToWideChar", _
            "int", 65001, "int", 0, _
            "str", $utf8string, "int", -1, _
            "ptr", DllStructGetPtr($buf), "int", $wideCount)
    If @error Or $ret[0] = 0 Then Return SetError(1, 0, "")

    ; Query the byte count required for the ANSI result (terminator included).
    $ret = DllCall("kernel32.dll", "int", "WideCharToMultiByte", _
            "int", 0, "int", 0, _
            "ptr", DllStructGetPtr($buf), "int", -1, _
            "ptr", 0, "int", 0, _
            "ptr", 0, "ptr", 0)
    If @error Or $ret[0] <= 0 Then Return SetError(2, 0, "")
    Local $ansiCount = $ret[0]

    ; UTF-16 -> ANSI (code page 0 = CP_ACP).
    Local $out = DllStructCreate("char[" & $ansiCount & "]")
    $ret = DllCall("kernel32.dll", "int", "WideCharToMultiByte", _
            "int", 0, "int", 0, _
            "ptr", DllStructGetPtr($buf), "int", -1, _
            "ptr", DllStructGetPtr($out), "int", $ansiCount, _
            "ptr", 0, "ptr", 0)
    If @error Or $ret[0] = 0 Then Return SetError(2, 0, "")

    Return DllStructGetData($out, 1)
EndFunc
Func _Utf8ToUnicode($Utf8String) ; (c) ViSiToR
    ; Decodes text whose characters are the raw bytes of a UTF-8 sequence
    ; into a native (UTF-16) AutoIt string.
    ;   $Utf8String - UTF-8 data held one byte per character.
    ; Returns the decoded string. Returns "" for empty input, and "" with
    ; @error = 1 when the Win32 conversion fails.
    If StringLen($Utf8String) = 0 Then Return ""

    ; Ask the API how many UTF-16 units are needed (terminator included,
    ; because the source length is given as -1).
    Local $Return = DllCall("Kernel32.dll", "int", "MultiByteToWideChar", _
            "int", 65001, _
            "int", 0, _
            "str", $Utf8String, _
            "int", -1, _
            "ptr", 0, _
            "int", 0)
    If @error Or $Return[0] <= 0 Then Return SetError(1, 0, "")
    Local $CharCount = $Return[0]

    ; A wchar buffer sized to the exact result: no zero-filled tail is left
    ; to be decoded into trailing ChrW(0) characters.
    Local $Buffer = DllStructCreate("wchar[" & $CharCount & "]")
    $Return = DllCall("Kernel32.dll", "int", "MultiByteToWideChar", _
            "int", 65001, _
            "int", 0, _
            "str", $Utf8String, _
            "int", -1, _
            "ptr", DllStructGetPtr($Buffer), _
            "int", $CharCount)
    If @error Or $Return[0] = 0 Then Return SetError(1, 0, "")

    ; Reading a wchar array yields the string up to its terminator.
    Local $UnicodeString = DllStructGetData($Buffer, 1)
    $Buffer = 0
    Return $UnicodeString
EndFunc [/more]
Вот, поменял кое-что, пробуйте (файл 1.pdf всё равно не обрабатывается).
[more=скрипт]#Include <Array.au3>
; Batch-renames PDF files after their title.
; For every *.pdf in the working directory the script:
;   1. converts page 2 to XML with pdftohtml.exe (written to out.xml),
;   2. finds the largest text height on that page,
;   3. joins all text runs that use this height into the document title,
;   4. copies the PDF to "<title>.pdf" (overwriting an existing copy).
$search = FileFindFirstFile("*.pdf")
If $search = -1 Then
    MsgBox(0, "Error", "No files/directories matched the search pattern")
    Exit
EndIf

; $file[0] is an unused placeholder; matches are stored at indexes 1..$i.
$i = 0
Dim $file[1]
While 1
    $file_n = FileFindNextFile($search)
    If @error Then ExitLoop
    _ArrayAdd($file, $file_n)
    $i = $i + 1
WEnd
FileClose($search)

For $j = 1 To $i
    ; Drop the XML left by the previous file so that a failed conversion
    ; cannot be mistaken for the output of the current file.
    FileDelete("out.xml")

    ; The path is quoted: a name with spaces would otherwise be split
    ; into several command-line arguments.
    RunWait('pdftohtml.exe -f 2 -l 2 -hidden -xml -enc UTF-8 "' & $file[$j] & '" out', '', @SW_HIDE)

    ; --- pass 1: find the maximum font height ---
    $out = FileOpen("out.xml", 0)
    If $out = -1 Then
        MsgBox(0, "Error", "Unable to open xml file.")
        ; Skip only this file instead of aborting the whole batch.
        ContinueLoop
    EndIf
    $max_font = 1
    While 1
        $line = FileReadLine($out)
        If @error = -1 Then ExitLoop
        ; Reduce the line to the bare value of its height="..." attribute.
        $line = StringRegExpReplace($line, '<text.*height="', '')
        $line = StringRegExpReplace($line, '" font.*', '')
        If StringIsInt($line) Then
            $font = Number($line)
            If $font > $max_font Then $max_font = $font
        EndIf
    WEnd
    FileClose($out)

    ; --- pass 2: collect the text of every run drawn with that height ---
    $out = FileOpen("out.xml", 0)
    If $out = -1 Then
        MsgBox(0, "Error", "Unable to open xml file.")
        ContinueLoop
    EndIf
    $name = ''
    While 1
        $line = FileReadLine($out)
        If @error = -1 Then ExitLoop
        ; The closing quote makes the height match exact rather than a prefix match.
        If StringRegExp($line, '<text.*height="' & $max_font & '"') Then
            $temp = StringRegExpReplace($line, '<text.*">', '')
            $temp = StringRegExpReplace($temp, '</text>', '')
            $name = $name & ' ' & $temp
        EndIf
    WEnd
    FileClose($out)

    ; Remove the inline formatting tags emitted by pdftohtml.
    $name = StringReplace($name, '<b>', '')
    $name = StringReplace($name, '</b>', '')
    $name = StringReplace($name, '<i>', '')
    $name = StringReplace($name, '</i>', '')
    $name = StringStripCR($name)
    $name = _Utf8ToAnsi($name)
    ; Replace characters that are illegal in Windows file names. This runs
    ; after the ANSI conversion because unmappable characters become "?".
    $name = StringRegExpReplace($name, '[\\/:*?"<>|]', '_')
    $name = StringStripWS($name, 3)

    If $name <> '' Then FileCopy($file[$j], $name & '.pdf', 1)
Next
Next
Func _Utf8ToAnsi($utf8string); (c) ViSiToR
    ; Converts text whose characters are the raw bytes of a UTF-8 sequence
    ; into a string in the system ANSI code page (CP_ACP).
    ;   $utf8string - UTF-8 data held one byte per character.
    ; Returns the converted string. Returns "" for empty input, and ""
    ; with @error set (1 = UTF-8 decoding failed, 2 = ANSI encoding failed)
    ; when a Win32 call fails.
    If StringLen($utf8string) = 0 Then Return ""

    ; Ask the API how many UTF-16 units are needed (terminator included,
    ; because the source length is given as -1).
    Local $ret = DllCall("Kernel32.dll", "int", "MultiByteToWideChar", _
            "int", 65001, "int", 0, _
            "str", $utf8string, "int", -1, _
            "ptr", 0, "int", 0)
    If @error Or $ret[0] <= 0 Then Return SetError(1, 0, "")
    Local $wideCount = $ret[0]

    ; UTF-8 -> UTF-16; the size argument is a count of wide characters, not bytes.
    Local $buf = DllStructCreate("wchar[" & $wideCount & "]")
    $ret = DllCall("Kernel32.dll", "int", "MultiByteToWideChar", _
            "int", 65001, "int", 0, _
            "str", $utf8string, "int", -1, _
            "ptr", DllStructGetPtr($buf), "int", $wideCount)
    If @error Or $ret[0] = 0 Then Return SetError(1, 0, "")

    ; Query the byte count required for the ANSI result (terminator included).
    $ret = DllCall("kernel32.dll", "int", "WideCharToMultiByte", _
            "int", 0, "int", 0, _
            "ptr", DllStructGetPtr($buf), "int", -1, _
            "ptr", 0, "int", 0, _
            "ptr", 0, "ptr", 0)
    If @error Or $ret[0] <= 0 Then Return SetError(2, 0, "")
    Local $ansiCount = $ret[0]

    ; UTF-16 -> ANSI (code page 0 = CP_ACP).
    Local $out = DllStructCreate("char[" & $ansiCount & "]")
    $ret = DllCall("kernel32.dll", "int", "WideCharToMultiByte", _
            "int", 0, "int", 0, _
            "ptr", DllStructGetPtr($buf), "int", -1, _
            "ptr", DllStructGetPtr($out), "int", $ansiCount, _
            "ptr", 0, "ptr", 0)
    If @error Or $ret[0] = 0 Then Return SetError(2, 0, "")

    Return DllStructGetData($out, 1)
EndFunc
Func _Utf8ToUnicode($Utf8String) ; (c) ViSiToR
    ; Decodes text whose characters are the raw bytes of a UTF-8 sequence
    ; into a native (UTF-16) AutoIt string.
    ;   $Utf8String - UTF-8 data held one byte per character.
    ; Returns the decoded string. Returns "" for empty input, and "" with
    ; @error = 1 when the Win32 conversion fails.
    If StringLen($Utf8String) = 0 Then Return ""

    ; Ask the API how many UTF-16 units are needed (terminator included,
    ; because the source length is given as -1).
    Local $Return = DllCall("Kernel32.dll", "int", "MultiByteToWideChar", _
            "int", 65001, _
            "int", 0, _
            "str", $Utf8String, _
            "int", -1, _
            "ptr", 0, _
            "int", 0)
    If @error Or $Return[0] <= 0 Then Return SetError(1, 0, "")
    Local $CharCount = $Return[0]

    ; A wchar buffer sized to the exact result: no zero-filled tail is left
    ; to be decoded into trailing ChrW(0) characters.
    Local $Buffer = DllStructCreate("wchar[" & $CharCount & "]")
    $Return = DllCall("Kernel32.dll", "int", "MultiByteToWideChar", _
            "int", 65001, _
            "int", 0, _
            "str", $Utf8String, _
            "int", -1, _
            "ptr", DllStructGetPtr($Buffer), _
            "int", $CharCount)
    If @error Or $Return[0] = 0 Then Return SetError(1, 0, "")

    ; Reading a wchar array yields the string up to its terminator.
    Local $UnicodeString = DllStructGetData($Buffer, 1)
    $Buffer = 0
    Return $UnicodeString
EndFunc [/more]